mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-03-14 18:37:23 +00:00
update kt-kernel
This commit is contained in:
54
kt-kernel/.githooks/commit-msg
Executable file
54
kt-kernel/.githooks/commit-msg
Executable file
@@ -0,0 +1,54 @@
|
||||
#!/bin/sh
# commit-msg hook to enforce Conventional Commits (https://www.conventionalcommits.org/)
# Checks the commit message subject (first line) against the project's
# bracketed conventional-commit format.
# If the message does not conform, the hook exits non-zero to block the commit.

# Git passes the path of the commit-message file as $1.
if [ -z "$1" ]; then
    echo "commit-msg hook: no message file provided" >&2
    exit 0
fi

MSG_FILE="$1"
read -r FIRST_LINE < "$MSG_FILE" || FIRST_LINE=""

# Trim leading/trailing whitespace. printf (not echo) so that backslashes
# in the subject are not interpreted by the shell's echo builtin.
FIRST_LINE="$(printf '%s' "$FIRST_LINE" | sed -e 's/^[ \t]*//' -e 's/[ \t]*$//')"

# Allow empty subject (git itself rejects empty messages), and let merge /
# revert / autosquash subjects through -- including the "Merge branch ..."
# and 'Revert "..."' forms that git generates itself.
if [ -z "$FIRST_LINE" ]; then
    exit 0
fi
case "$FIRST_LINE" in
    Merge:*|merge:*|Revert:*|revert:*|"Merge "*|"Revert "*|fixup!*|squash!*)
        exit 0
        ;;
esac

# Conventional Commit regex (POSIX ERE)
# [type(scope)?]!?: subject   or   [type](scope)!?: subject
# types: feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip
# scope: any chars except )
# The scope may appear inside OR after the brackets so that both the
# "[fix(parser)]: ..." example printed below and "[fix](parser): ..."
# are accepted (the previous regex rejected its own example).
regex='^\[(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert|wip)(\([^)]+\))?\](\([^)]+\))?(!)?: .+'

if printf '%s' "$FIRST_LINE" | grep -E "$regex" >/dev/null 2>&1; then
    exit 0
fi

cat <<'EOF' >&2
ERROR: Commit message does not follow Conventional Commits.

Expected format:
[type](scope)?: subject

Examples:
[feat]: add new feature
[fix(parser)]: handle edge case
[docs]!: update API docs (breaking change)

Allowed types: feat, fix, docs, style, refactor, perf, test, build, ci, chore, revert, wip

You can bypass this hook locally by running:
git commit --no-verify
EOF

exit 1
|
||||
63
kt-kernel/.githooks/pre-commit
Executable file
63
kt-kernel/.githooks/pre-commit
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env bash
# Pre-commit hook: run clang-format via CMake 'format' target and Black for Python before allowing commit.
# If formatting makes changes, stage them and abort so user can review.
set -euo pipefail

REPO_ROOT="$(git rev-parse --show-toplevel)"
BUILD_DIR="$REPO_ROOT/build"
FORMAT_TARGET="format"
CLANG_FORMAT_BIN="${CLANG_FORMAT_BIN:-clang-format}"
BLACK_BIN="${BLACK_BIN:-black}"

# clang-format optional: if missing, skip C/C++ formatting
if ! command -v "$CLANG_FORMAT_BIN" >/dev/null 2>&1; then
    echo "[pre-commit] clang-format not found (looked for $CLANG_FORMAT_BIN). Skipping C/C++ format." >&2
fi

# black optional: if missing, skip Python formatting
if ! command -v "$BLACK_BIN" >/dev/null 2>&1; then
    echo "[pre-commit] black not found (looked for $BLACK_BIN). Skipping Python format." >&2
fi

# Configure build directory if missing or not yet generated (quiet).
# The braces are required: '&&' and '||' have equal precedence and
# associate left-to-right, so without grouping this condition parsed as
# '(missing-dir OR no-Makefile) AND no-ninja', which skipped configuration
# for a freshly-deleted build dir that never had build.ninja.
if [ ! -d "$BUILD_DIR" ] || { [ ! -f "$BUILD_DIR/Makefile" ] && [ ! -f "$BUILD_DIR/build.ninja" ]; }; then
    echo "[pre-commit] configuring project (cmake) ..." >&2
    cmake -S "$REPO_ROOT" -B "$BUILD_DIR" >/dev/null
fi

# Run clang-format target when available and tool present (prefer ninja if present)
if command -v "$CLANG_FORMAT_BIN" >/dev/null 2>&1; then
    if [ -f "$BUILD_DIR/build.ninja" ]; then
        (cd "$BUILD_DIR" && ninja -k0 "$FORMAT_TARGET" >/dev/null)
    else
        (cd "$BUILD_DIR" && make "$FORMAT_TARGET")
    fi
fi

# Run black on staged python files (or entire repo if you prefer)
if command -v "$BLACK_BIN" >/dev/null 2>&1; then
    # Get staged python files; if none, skip
    PY_FILES=$(git diff --cached --name-only --diff-filter=ACM | grep -E '\.py$' || true)
    if [ -n "$PY_FILES" ]; then
        echo "[pre-commit] running black on staged python files..." >&2
        # Feed one path per line so filenames containing spaces are not
        # split into multiple arguments.
        while IFS= read -r py_file; do
            "$BLACK_BIN" "$py_file"
        done <<< "$PY_FILES"
    else
        # Optionally format all python files; comment out if not desired
        # $BLACK_BIN "$REPO_ROOT"
        :
    fi
fi

# Stage any formatting changes for tracked files
if ! git diff --quiet --exit-code; then
    echo "[pre-commit] Formatting applied; updating index." >&2
    # Add only modified tracked files (exclude untracked new files not staged yet unless user staged them)
    git add -u
    echo "[pre-commit] Re-run git commit to proceed after reviewing changes." >&2
    exit 1
fi

echo "[pre-commit] format OK." >&2
exit 0
|
||||
6
kt-kernel/.gitmodules
vendored
Normal file
6
kt-kernel/.gitmodules
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
[submodule "pybind11"]
|
||||
path = third_party/pybind11
|
||||
url = https://github.com/pybind/pybind11.git
|
||||
[submodule "llama.cpp"]
|
||||
path = third_party/llama.cpp
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
@@ -11,16 +11,34 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
|
||||
|
||||
## Installation
|
||||
|
||||
### Prerequisites
|
||||
|
||||
First, initialize git submodules:
|
||||
```bash
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
### Standard Installation
|
||||
```bash
|
||||
pip install .
|
||||
```
|
||||
|
||||
All dependencies (torch, safetensors, compressed-tensors, numpy) will be automatically installed from `pyproject.toml`.
|
||||
|
||||
### Editable Installation (Development)
|
||||
```bash
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
### Optional: Pre-install Dependencies
|
||||
|
||||
If you encounter network issues or prefer to install dependencies separately, you can optionally use:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
**Note**: This step is **optional**. If your environment already has torch and other required packages, you can skip this and directly run `pip install .`
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
|
||||
@@ -19,10 +19,13 @@ classifiers = [
|
||||
]
|
||||
requires-python = ">=3.8"
|
||||
dependencies = [
|
||||
# Install black by default so git hook can use it
|
||||
# Core dependencies
|
||||
"torch>=2.0.0",
|
||||
"safetensors>=0.4.0",
|
||||
"compressed-tensors>=0.7.0",
|
||||
"numpy>=1.24.0",
|
||||
# Development dependencies
|
||||
"black>=25.9.0",
]
|
||||
|
||||
# No optional dev group needed for formatting; using custom git hooks instead of pre-commit
|
||||
|
||||
12
kt-kernel/requirements.txt
Normal file
12
kt-kernel/requirements.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
# Optional: Install these if not already available in your environment
|
||||
# These dependencies will be automatically installed when running `pip install .`
|
||||
# You can skip this file if you already have these packages installed
|
||||
|
||||
# Core dependencies (minimum versions)
|
||||
torch>=2.0.0
|
||||
safetensors>=0.4.0
|
||||
compressed-tensors>=0.7.0
|
||||
numpy>=1.24.0
|
||||
|
||||
# Development dependencies
|
||||
black>=25.9.0
|
||||
1
kt-kernel/third_party/llamafile/README.md
vendored
Normal file
1
kt-kernel/third_party/llamafile/README.md
vendored
Normal file
@@ -0,0 +1 @@
|
||||
The code in this folder is copied from [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile). Special thanks to the Mozilla-Ocho team.
|
||||
25
kt-kernel/third_party/llamafile/bench.h
vendored
Normal file
25
kt-kernel/third_party/llamafile/bench.h
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/bench.h
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#include <stdio.h>

#include "micros.h"  // micros(): microsecond timestamps used for timing

// BENCH(x): time expression `x` and print its mean per-iteration cost.
//
// `x` runs once untimed as a warm-up, then ITERATIONS more times under the
// clock (ITERATIONS must be defined by the includer before using BENCH).
// The empty volatile asm statements are compiler memory barriers: they keep
// the optimizer from hoisting, sinking, or deleting `x` around the timing
// calls. The printed value is the ceiling of the mean wall time in
// microseconds, labeled with the stringized expression.
// NOTE: `x` is evaluated ITERATIONS + 1 times, so it must be repeatable
// and side-effect tolerant.
#define BENCH(x) \
    do { \
        x; \
        __asm__ volatile("" ::: "memory"); \
        long long start = micros(); \
        for (int i = 0; i < ITERATIONS; ++i) { \
            __asm__ volatile("" ::: "memory"); \
            x; \
            __asm__ volatile("" ::: "memory"); \
        } \
        printf("%9lld us %s\n", (micros() - start + ITERATIONS - 1) / ITERATIONS, #x); \
    } while (0)
|
||||
8
kt-kernel/third_party/llamafile/flags.cpp
vendored
Normal file
8
kt-kernel/third_party/llamafile/flags.cpp
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#include "flags.h"

// Definition of the process-wide flag declared in flags.h; defaults to off.
// NOTE(review): presumably selects a more numerically precise (slower)
// compute path in the kernels -- confirm against usage sites.
bool FLAG_precise = false;
||||
8
kt-kernel/third_party/llamafile/flags.h
vendored
Normal file
8
kt-kernel/third_party/llamafile/flags.h
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// NOTE(review): URL above points at flags.cpp; the matching upstream header
// is flags.h -- likely a copy/paste slip.
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once

// Process-wide flag; defined (default false) in flags.cpp.
extern bool FLAG_precise;
|
||||
4869
kt-kernel/third_party/llamafile/iqk_mul_mat.inc
vendored
Normal file
4869
kt-kernel/third_party/llamafile/iqk_mul_mat.inc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8
kt-kernel/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
vendored
Normal file
8
kt-kernel/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_avx2.cpp
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// x86-64 only: compile the shared iqk_mul_mat implementation with this
// translation unit's (AVX2) compiler flags; on other targets this TU is
// intentionally empty.
#if defined(__x86_64__) || defined(_M_X64)
#include "iqk_mul_mat.inc"
#endif // __x86_64__
|
||||
10
kt-kernel/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
vendored
Normal file
10
kt-kernel/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_zen4.cpp
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// x86-64 only: rename the entry points with a _zen4 suffix so this TU's
// Zen4-flag build of the shared implementation can be linked alongside the
// builds produced by the sibling translation units.
#if defined(__x86_64__) || defined(_M_X64)
#define iqk_mul_mat iqk_mul_mat_zen4
#define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
#include "iqk_mul_mat.inc"
#endif // __x86_64__
|
||||
3063
kt-kernel/third_party/llamafile/iqk_mul_mat_arm.inc
vendored
Normal file
3063
kt-kernel/third_party/llamafile/iqk_mul_mat_arm.inc
vendored
Normal file
File diff suppressed because it is too large
Load Diff
10
kt-kernel/third_party/llamafile/iqk_mul_mat_arm82.cpp
vendored
Normal file
10
kt-kernel/third_party/llamafile/iqk_mul_mat_arm82.cpp
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_arm82.cpp
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// AArch64 only: rename the entry points with an _arm82 suffix so this TU's
// ARMv8.2-flag build of the shared ARM implementation gets distinct symbols.
#ifdef __aarch64__
#define iqk_mul_mat iqk_mul_mat_arm82
#define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
#include "iqk_mul_mat_arm.inc"
#endif // __aarch64__
|
||||
14
kt-kernel/third_party/llamafile/macros.h
vendored
Normal file
14
kt-kernel/third_party/llamafile/macros.h
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/macros.h
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

// NOTE: plain function-like macros -- arguments may be evaluated more than
// once, so never pass expressions with side effects (e.g. i++).

// Minimum / maximum of two values.
#define MIN(X, Y) ((Y) > (X) ? (X) : (Y))
#define MAX(X, Y) ((Y) < (X) ? (X) : (Y))
// Integer division of M by N, rounding up (intended for positive values).
#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
// Round X up to the next multiple of K; the bitmask form requires K to be
// a power of two.
#define ROUNDUP(X, K) (((X) + (K) - 1) & -(K))
// Element count of a true array. The second divisor deliberately becomes a
// compile-time division by zero if sizeof(A) is not a multiple of the
// element size -- a cheap guard against passing a decayed pointer.
#define ARRAYLEN(A) ((sizeof(A) / sizeof(*(A))) / ((unsigned)!(sizeof(A) % sizeof(*(A)))))
||||
41
kt-kernel/third_party/llamafile/micros.h
vendored
Normal file
41
kt-kernel/third_party/llamafile/micros.h
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/micros.h
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once

#include <ctime>

#ifndef _WIN32
#include <unistd.h>
#else
#include <windows.h>
#endif

#ifdef _WIN32
// Ticks-per-second of the Windows high-resolution performance counter.
static long long GetQueryPerformanceFrequency() {
    LARGE_INTEGER t;
    QueryPerformanceFrequency(&t);
    return t.QuadPart;
}
// Current raw value of the Windows high-resolution performance counter.
static long long GetQueryPerformanceCounter() {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return t.QuadPart;
}
#endif

// Returns a timestamp in microseconds, rounding fractional microseconds up.
// POSIX: wall-clock time since the epoch (CLOCK_REALTIME, so it can jump if
// the system clock is adjusted -- NOTE(review): CLOCK_MONOTONIC would be
// steadier for benchmarking; kept as-is to preserve behavior).
// Windows: time since the first call, via QueryPerformanceCounter.
static long long micros(void) {
#ifndef _WIN32
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    // Widen before multiplying: tv_sec is time_t, which may be 32 bits, and
    // tv_sec * 1000000 in time_t arithmetic would overflow (UB) long before
    // the result is converted to long long.
    return (long long)ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000;
#else
    static long long timer_freq = GetQueryPerformanceFrequency();
    static long long timer_start = GetQueryPerformanceCounter();
    return ((GetQueryPerformanceCounter() - timer_start) * 1000000) / timer_freq;
#endif
}
|
||||
59
kt-kernel/third_party/llamafile/numba.h
vendored
Normal file
59
kt-kernel/third_party/llamafile/numba.h
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/numba.h
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once

// Deterministic pseudo-random generator: the top 32 bits of a 64-bit
// linear congruential generator (Knuth's MMIX constants). Uses static
// state, so the sequence is global and calls are not thread-safe.
inline int rand32(void) {
    static unsigned long long lcg = 1;
    lcg *= 6364136223846793005;
    lcg += 1442695040888963407;
    return lcg >> 32;
}

// Portable population count (number of set bits) via parallel bit summing.
inline int popcount(unsigned x) {
    x = x - ((x >> 1) & 0x55555555);
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x = (x + (x >> 16));
    return (x + (x >> 8)) & 0x0000003F;
}

// Number of bit positions at which x and y differ.
inline int hamming(int x, int y) {
    return popcount(x ^ y);
}

// Map a 32-bit random word to a float strictly inside (0,1): the top 23
// bits plus 0.5, scaled by 2^-23.
inline float float01(unsigned x) { // (0,1)
    return 1.f / 8388608 * ((x >> 9) + .5f);
}

// Random float strictly inside (-1,1). (The upstream comment said
// (-10,10), but float01 yields (0,1), so 2*float01 - 1 is (-1,1).)
inline float numba(void) { // (-1,1)
    return float01(rand32()) * 2.f - 1.f;
}

// Fill vector A[0..n) with random values from numba().
template <typename T>
void randomize(T* A, int n) {
    for (int i = 0; i < n; ++i)
        A[i] = numba();
}

// Fill an m x n matrix stored column-major with leading dimension lda
// with random values from numba().
template <typename T>
void randomize(int m, int n, T* A, int lda) {
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            A[lda * j + i] = numba();
}

// Set every element of vector A[0..n) to x.
template <typename T, typename U>
void broadcast(T* A, int n, U x) {
    for (int i = 0; i < n; ++i)
        A[i] = x;
}

// Set every element of an m x n column-major matrix (leading dimension
// lda) to x.
template <typename T, typename U>
void broadcast(int m, int n, T* A, int lda, U x) {
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            A[lda * j + i] = x;
}
|
||||
204
kt-kernel/third_party/llamafile/sgemm.cpp
vendored
Normal file
204
kt-kernel/third_party/llamafile/sgemm.cpp
vendored
Normal file
@@ -0,0 +1,204 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
||||
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||
//
|
||||
// Copyright 2024 Mozilla Foundation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "sgemm.h"
|
||||
// #include <cosmo.h>
|
||||
// #include <cpuid.h>
|
||||
// #include <libc/sysv/consts/hwcap.h>
|
||||
#include <stdio.h>
|
||||
// #include <sys/auxv.h>
|
||||
#include <cassert>
|
||||
// #include "llamafile.h"
|
||||
|
||||
// Function-pointer table bound once during static initialization to the
// best kernel build for the architecture this binary was compiled for.
// NOTE: upstream llamafile chose kernels at RUNTIME with X86_HAVE()/CPUID
// and getauxval(HWCAP) probes (that code is elided here); this port selects
// them at COMPILE time from the compiler's target-ISA macros, so a binary
// built for one microarchitecture will not adapt to another host.
static const struct GemmFuncs {
    bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
    bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
    bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
    // typeof(llamafile_sgemm)* sgemm;
    // typeof(llamafile_mixmul)* mixmul;
    // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
    GemmFuncs() {
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX__)
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
        // AMD Zen4+ (2023-)
        sgemm = llamafile_sgemm_amd_zen4;
        mixmul = llamafile_mixmul_amd_zen4;
        iqk_mixmul = iqk_mul_mat_moe_zen4;
#else
        // Intel Xeon Skylake+ (2015-)
        sgemm = llamafile_sgemm_amd_avx512f;
        mixmul = llamafile_mixmul_amd_avx512f;
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#elif defined(__AVXVNNI__)
        // Intel Alderlake (2021-)
        sgemm = llamafile_sgemm_amd_avxvnni;
        mixmul = llamafile_mixmul_amd_avxvnni;
        iqk_mixmul = iqk_mul_mat_moe;
#else
        // Intel Haswell/Broadwell/Skylake (2013-2020)
        // AMD Excavator (2015-2022)
        sgemm = llamafile_sgemm_amd_avx2;
        mixmul = llamafile_mixmul_amd_avx2;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // AMD Piledriver (2011-2014)
        sgemm = llamafile_sgemm_amd_fma;
        mixmul = llamafile_mixmul_amd_fma;
#if defined(__F16C__)
        iqk_mixmul = iqk_mul_mat_moe;
#endif
#endif
#else
        // Intel Sandybridge/Ivybridge (2010-2012)
        // AMD Bulldozer (2011)
        sgemm = llamafile_sgemm_amd_avx;
        mixmul = llamafile_mixmul_amd_avx;
#endif
#else
        // AMD K8/Barcelona (2003-2010)
        // Intel Core/Nehalem (2006-2009)
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif

#elif defined(__aarch64__)
        // Upstream checked HWCAP fp16/dotprod bits at runtime; this port
        // assumes an ARMv8.2 target (e.g. Apple M1, Raspberry Pi 5).
        sgemm = llamafile_sgemm_arm82;
        mixmul = llamafile_mixmul_arm82;
        iqk_mixmul = iqk_mul_mat_moe_arm82;
#else
        // No handwritten kernels for this architecture.
        sgemm = llamafile_sgemm_unsupported;
        mixmul = llamafile_mixmul_unsupported;
#endif
    }
} funcs;
|
||||
|
||||
/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param task is GGML task type
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @param precision may be used to control the internal compute type
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    // Forward to the architecture-specific kernel chosen by the file-level
    // GemmFuncs table during static initialization.
    return funcs.sgemm(m, n, k, A, lda, B, ldb, C, ldc, ith, nth, task, Atype, Btype, Ctype,
                       precision);
}
|
||||
|
||||
/**
 * Performs "mixture of experts" tensor multiplication on CPU.
 *
 * Forwards to the architecture-specific implementation selected by the
 * file-level GemmFuncs table and returns its boolean result.
 */
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
    return funcs.mixmul(params, weights, thought, plan, result);
}
|
||||
|
||||
// Thin forwarder to the iqk mixture-of-experts matmul kernel selected by
// the file-level GemmFuncs table; returns its boolean result.
bool llamafile_mixmul_iqk(long Nx, long Ny, long ne00, int ne11, int typeA, const void* A, const void* B, float* C, long nb1, long nb2, const void* vrow_mapping, int ith, int nth) {
    return funcs.iqk_mixmul(Nx, Ny, ne00, ne11, typeA, A, B, C, nb1, nb2, vrow_mapping, ith, nth);
}
|
||||
92
kt-kernel/third_party/llamafile/sgemm.h
vendored
Normal file
92
kt-kernel/third_party/llamafile/sgemm.h
vendored
Normal file
@@ -0,0 +1,92 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/sgemm.h
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
#include <stdbool.h>
|
||||
#include <cstddef>
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_tensor;
|
||||
struct ggml_compute_params;
|
||||
#ifdef __aarch64__
|
||||
|
||||
bool iqk_mul_mat(long, long, long, int, const void*, const void*, float*, long, int, int);
|
||||
bool iqk_mul_mat_zen4(long, long, long, int, const void*, const void*, float*, long, int, int);
|
||||
bool iqk_mul_mat_arm82(long, long, long, int, const void*, const void*, float*, long, int, int);
|
||||
|
||||
bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_arm82(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
|
||||
bool llamafile_sgemm(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_mixmul(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
size_t llamafile_mixmul_needs(const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*);
|
||||
|
||||
bool llamafile_sgemm_unsupported(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_fma(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx2(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avxvnni(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx512f(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_zen4(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_arm80(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_arm82(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
|
||||
bool llamafile_mixmul_unsupported(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_fma(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx2(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avxvnni(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx512f(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_zen4(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_arm80(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_arm82(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_iqk(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
|
||||
#else
|
||||
|
||||
bool iqk_mul_mat(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
|
||||
bool iqk_mul_mat_zen4(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
|
||||
bool iqk_mul_mat_arm82(long, long, long,int, const void*, long, int, const void*, long,float*, long, int, int);
|
||||
|
||||
|
||||
bool iqk_mul_mat_moe(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_zen4(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_arm82(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
|
||||
bool llamafile_sgemm(long m, long n, long k, const void* a, long lda, const void* b, long ldb, void* c, long ldc, int ith, int nth, int task_type, int a_type, int b_type, int c_type, int precision);
|
||||
bool llamafile_mixmul(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
size_t llamafile_mixmul_needs(const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*);
|
||||
|
||||
bool llamafile_sgemm_unsupported(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_fma(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx2(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avxvnni(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_avx512f(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_amd_zen4(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_arm80(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
bool llamafile_sgemm_arm82(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
|
||||
|
||||
bool llamafile_mixmul_unsupported(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_fma(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx2(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avxvnni(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_avx512f(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_amd_zen4(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_arm80(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_arm82(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
|
||||
bool llamafile_mixmul_iqk(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
1054
kt-kernel/third_party/llamafile/tinyblas_cpu.h
vendored
Normal file
1054
kt-kernel/third_party/llamafile/tinyblas_cpu.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
411
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul.inc
vendored
Normal file
411
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul.inc
vendored
Normal file
@@ -0,0 +1,411 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul.inc
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
||||
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||
//
|
||||
// Copyright 2024 Mozilla Foundation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tinyblas_cpu.h"
|
||||
|
||||
//
|
||||
//
|
||||
// ██████╗ ██╗ █████╗ ██████╗
|
||||
// ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║ ██╔══██╗██╔═══╝
|
||||
// ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║ ███████║██████╗
|
||||
// ██║ ██║██▀███║╚███╔╝██╔══██╗██║ ██╔══██║╔═══██║
|
||||
// ██║ ██║██║ ██║ ███║ ██████╔╝████╗██║ ██║██████║
|
||||
// ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝ ╚═╝╚═════╝
|
||||
//
|
||||
// MIXTURE OF EXPERTS TENSOR MULTIPLICATION
|
||||
//
|
||||
//
|
||||
// SHAPES
|
||||
//
|
||||
// - weights [cols, rows, experts]
|
||||
// - thought [cols, tasks, tokens] w/ tasks ≤ thinkers
|
||||
// - result [rows, thinkers, tokens] w/ thinkers ≤ experts
|
||||
// - plan [thinkers, tokens] w/ i32 < experts
|
||||
//
|
||||
// DEFINITION
|
||||
//
|
||||
// for thinker in range(thinkers):
|
||||
// for token in range(tokens):
|
||||
// for row in range(rows):
|
||||
// c = 0
|
||||
// for col in range(cols):
|
||||
// expert = plan[token][thinker]
|
||||
// a = weights[expert][row][col]
|
||||
// b = thought[token][thinker % tasks][col]
|
||||
// c += a * b
|
||||
// result[token][thinker][row] = c
|
||||
//
|
||||
// REGULARITIES
|
||||
//
|
||||
// - tokens can be odd
|
||||
// - thinkers is usually 2
|
||||
// - tasks is usually 1 or 2
|
||||
// - cols should be a multiple of 64
|
||||
// - rows should be a multiple of 64
|
||||
// - experts is usually 8 but could be 60
|
||||
// - tokens is always 1 for token generation
|
||||
// - tokens can be huge for prompt processing
|
||||
//
|
||||
// EXAMPLE
|
||||
//
|
||||
// mixtral 8x7b w/ 217 token prompt
|
||||
//
|
||||
// | ne*0 ne*1 ne*2 ne*3 | nb*0 nb*1 nb*2 nb*3 | type
|
||||
// =========================================================================
|
||||
// weights | 16384 6144 8 1 | 18 0x2400 0x3600000 0x1b000000 | q4_0
|
||||
// thought | 16384 2 217 1 | 4 0x10000 0x20000 0x1b20000 | f32
|
||||
// result | 6144 2 217 1 | 4 0x6000 0xc000 0xa2c000 | f32
|
||||
// plan | 2 217 1 1 | 4 0x20 0x1b20 0x1b20 | i32
|
||||
//
|
||||
|
||||
namespace {

// Orchestrates mixture-of-experts tensor multiplication on CPU.
// Shapes (see header comment above): weights [cols, rows, experts],
// thought [cols, tasks, tokens], plan [thinkers, tokens] (i32 expert ids),
// result [rows, thinkers, tokens]. The work buffer params->wdata is carved
// up via allocate() into a quantized copy of `thought` plus per-expert row
// pointer tables, so each expert's rows can be multiplied as one batch.
class MixMul {
  public:
    // Captures tensor geometry; allocates nothing (see allocate_shared_memory()).
    // ldq is the per-row byte stride of the quantized thought buffer, rounded
    // up to ROW_ALIGN; wdata_ is params->wdata rounded up to MAX_ALIGN.
    MixMul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result)
        : params(params),
          weights(weights),
          thought(thought),
          plan(plan),
          result(result),
          rows(weights->ne[1]),
          cols(weights->ne[0]),
          experts(weights->ne[2]),
          thinkers(plan->ne[0]),
          tasks(thought->ne[1]),
          tokens(thought->ne[2]),
          ldq((cols * 2 + ROW_ALIGN - 1) & -ROW_ALIGN),
          wdata_((char*)(((uintptr_t)params->wdata + MAX_ALIGN - 1) & -MAX_ALIGN)),
          allocated_(0) {
    }

    // Carves the shared work buffer into the four scratch arrays below.
    // Returns false if params->wsize is too small for any of them.
    bool allocate_shared_memory() {
        if (!(quantized_thought_ = allocate<char>(MATRIX_ALIGN, tokens * tasks * ldq)))
            return false;
        if (!(rowptr_result_ = allocate<uintptr_t>(ROW_ALIGN, experts * tokens * thinkers)))
            return false;
        if (!(rowptr_thought_ = allocate<uintptr_t>(ROW_ALIGN, experts * tokens * thinkers)))
            return false;
        if (!(rowptr_count_ = allocate<long>(sizeof(long), experts)))
            return false;
        return true;
    }

    // Total bytes consumed from params->wdata, including the initial
    // MAX_ALIGN rounding — used to size the work buffer up front.
    size_t get_allocated_bytes() {
        return (wdata_ - (char*)params->wdata) + allocated_;
    }

    // Validates tensor geometry/strides, then dispatches on the output type.
    // Returns false when this kernel cannot handle the operands (caller is
    // expected to fall back to the generic path).
    bool mixmul() {
        // invariants
        assert(tasks <= thinkers);
        assert(thinkers <= experts);
        assert(tokens == plan->ne[1]);
        assert(rows == result->ne[0]);
        assert(cols == thought->ne[0]);
        assert(tokens == result->ne[2]);
        assert(thinkers == result->ne[1]);

        // dimensionality
        assert(plan->ne[2] == 1);
        assert(plan->ne[3] == 1);
        assert(result->ne[3] == 1);
        assert(weights->ne[3] == 1);
        assert(thought->ne[3] == 1);

        // miscellaneous
        assert(params->nth > 0);
        assert(params->ith < params->nth);
        assert(plan->type == GGML_TYPE_I32);

        // check nb01 is convertible to lda
        if (weights->nb[1] % ggml_type_size(weights->type))
            return false;

        // no support for column strides
        if (result->nb[0] != ggml_type_size(result->type))
            return false;
        if (thought->nb[0] != ggml_type_size(thought->type))
            return false;
        if (weights->nb[0] != ggml_type_size(weights->type))
            return false;

        // supported output types
        switch (result->type) {
            case GGML_TYPE_F32:
                return mixmuler<float>();
            default:
                return false;
        }
    }

  private:
    // Selects a tinyBLAS kernel instantiation for the weight/activation type
    // pair under the compile-time ISA (AVX-512/AVX/SSE/NEON). KN is the SIMD
    // vector width, BS the block size (32 for Q4_0/Q8_0 blocks, 1 otherwise).
    template <typename TC>
    bool mixmuler() {
        switch (weights->type) {
            case GGML_TYPE_F32:
                if (thought->type != GGML_TYPE_F32)
                    return false;
#if defined(__AVX512F__)
                return mixmat<16, 1, tinyBLAS<NCB | NCC, 16, __m512, __m512, float, float, TC>, float,
                              float, TC>();
#elif defined(__AVX__) || defined(__AVX2__)
                return mixmat<8, 1, tinyBLAS<NCB | NCC, 8, __m256, __m256, float, float, TC>, float,
                              float, TC>();
#elif defined(__SSE__)
                return mixmat<4, 1, tinyBLAS<NCB | NCC, 4, __m128, __m128, float, float, TC>, float,
                              float, TC>();
#elif defined(__ARM_NEON)
                return mixmat<4, 1, tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, float, float, TC>,
                              float, float, TC>();
#else
                return false;
#endif

            case GGML_TYPE_BF16:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_BF16)
                    return false;
#if defined(__AVX512BF16__)
                // FLAG_precise selects the f32-accumulate variant over the
                // faster __m512bh dot-product variant.
                if (!FLAG_precise) {
                    return mixmat<
                        32, 1, tinyBLAS<NCB | NCC, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC>,
                        ggml_bf16_t, ggml_bf16_t, TC>();
                } else {
                    return mixmat<16, 1,
                                  tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC>,
                                  ggml_bf16_t, ggml_bf16_t, TC>();
                }
#elif defined(__AVX512F__)
                return mixmat<16, 1,
                              tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC>,
                              ggml_bf16_t, ggml_bf16_t, TC>();
#elif defined(__AVX2__)
                return mixmat<8, 1,
                              tinyBLAS<NCB | NCC, 8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, TC>,
                              ggml_bf16_t, ggml_bf16_t, TC>();
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
                return mixmat<
                    4, 1,
                    tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_bf16_t, ggml_bf16_t, TC>,
                    ggml_bf16_t, ggml_bf16_t, TC>();
#else
                return false;
#endif

            case GGML_TYPE_F16:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_F16)
                    return false;
#if defined(__AVX512F__)
                return mixmat<16, 1,
                              tinyBLAS<NCB | NCC, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC>,
                              ggml_fp16_t, ggml_fp16_t, TC>();
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
                // NOTE(review): upstream gated this on a runtime X86_CHECK(F16C);
                // here the check is compile-time only (see commented code).
                // if (X86_CHECK(F16C)) {
                return mixmat<8, 1,
                              tinyBLAS<NCB | NCC, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC>,
                              ggml_fp16_t, ggml_fp16_t, TC>();
                // } else {
                //     return false;
                // }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
                if (result->op_params[0] == GGML_PREC_F32) {
                    return mixmat<
                        4, 1,
                        tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, TC>,
                        ggml_fp16_t, ggml_fp16_t, TC>();
                } else {
                    return mixmat<
                        8, 1,
                        tinyBLAS<NCB | NCC, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC>,
                        ggml_fp16_t, ggml_fp16_t, TC>();
                }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
                return mixmat<
                    4, 1,
                    tinyBLAS<NCB | NCC, 4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, TC>,
                    ggml_fp16_t, ggml_fp16_t, TC>();
#else
                return false;
#endif

            case GGML_TYPE_Q4_0:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_Q8_0)
                    return false;
#if defined(__AVX2__) || defined(__AVX512F__)
                return mixmat<32, 32, tinyBLAS_Q0_AVX2<NCB | NCC, block_q4_0, block_q8_0, TC>,
                              block_q4_0, block_q8_0, TC>();
#elif defined(__ARM_FEATURE_DOTPROD)
                return mixmat<32, 32, tinyBLAS_Q0_ARM<NCB | NCC, block_q4_0, block_q8_0, TC>,
                              block_q4_0, block_q8_0, TC>();
#else
                return false;
#endif

            case GGML_TYPE_Q8_0:
                if (thought->type != GGML_TYPE_F32 && thought->type != GGML_TYPE_Q8_0)
                    return false;
#if defined(__AVX2__) || defined(__AVX512F__)
                return mixmat<32, 32, tinyBLAS_Q0_AVX2<NCB | NCC, block_q8_0, block_q8_0, TC>,
                              block_q8_0, block_q8_0, TC>();
#elif defined(__ARM_FEATURE_DOTPROD)
                return mixmat<32, 32, tinyBLAS_Q0_ARM<NCB | NCC, block_q8_0, block_q8_0, TC>,
                              block_q8_0, block_q8_0, TC>();
#else
                return false;
#endif

            default:
                return false;
        }
    }

    // Two-phase driver: in the INIT task every thread cooperatively quantizes
    // `thought` (if needed) and builds the per-expert row pointer tables; in
    // the COMPUTE task each expert's gathered rows are multiplied with one
    // tinyBLAS call (row pointers passed in place of dense matrices, so lda
    // for B and C is 0 and the pointer tables do the indirection).
    template <int KN, int BS, typename BLAS, typename TA, typename TB, typename TC>
    bool mixmat() {
        if (cols % KN)
            return false;
        switch (params->type) {
            case GGML_TASK_TYPE_INIT:
                if (thought->type != ggml_type_trait<TB>::id)
                    quantize_thought(ggml_type_trait<TB>::id);
                build_row_pointers(ggml_type_trait<TB>::id);
                return true;
            case GGML_TASK_TYPE_COMPUTE:
                assert(!(cols % BS));
                assert(!(weights->nb[1] % sizeof(TA)));
                for (int expert = 0; expert < experts; ++expert) {
                    BLAS tb{cols / BS,
                            (const TA*)((const char*)weights->data + expert * weights->nb[2]),
                            (long)(weights->nb[1] / sizeof(TA)),
                            (const TB*)(rowptr_thought_ + expert * tokens * thinkers),
                            0,
                            (TC*)(rowptr_result_ + expert * tokens * thinkers),
                            0,
                            params->ith,
                            params->nth};
                    tb.matmul(rows, rowptr_count_[expert], GGML_TASK_TYPE_COMPUTE);
                }
                return true;
            default:
                return true;
        }
    }

    // For each expert (experts are striped across threads), scans the plan
    // and records, per matching (token, thinker) slot, the addresses of the
    // result row and of the corresponding thought row (original tensor when
    // already in vec_dot_type, otherwise the quantized scratch copy).
    // rowptr_count_[expert] receives the number of rows gathered.
    void build_row_pointers(ggml_type vec_dot_type) {
        for (int expert = params->ith; expert < experts; expert += params->nth) {
            long count = 0;
            for (long token = 0; token < tokens; ++token)
                for (int thinker = 0; thinker < thinkers; ++thinker)
                    if (expert == *(const int32_t*)((const char*)plan->data +
                                                    token * plan->nb[1] + thinker * plan->nb[0])) {
                        long row = count++;
                        long idx = expert * thinkers * tokens + row;
                        rowptr_result_[idx] =
                            (uintptr_t)((char*)result->data + token * result->nb[2] +
                                        thinker * result->nb[1]);
                        if (thought->type == vec_dot_type)
                            rowptr_thought_[idx] =
                                (uintptr_t)((char*)thought->data + token * thought->nb[2] +
                                            thinker % tasks * thought->nb[1]);
                        else
                            rowptr_thought_[idx] =
                                (uintptr_t)((char*)quantized_thought_ + token * tasks * ldq +
                                            thinker % tasks * ldq);
                    }
            rowptr_count_[expert] = count;
        }
    }

    // Quantizes every (token, task) row of `thought` (assumed f32 source)
    // into quantized_thought_, round-robining rows across threads.
    void quantize_thought(ggml_type vec_dot_type) {
        long chore = 0;
        for (long token = 0; token < tokens; ++token)
            for (int task = 0; task < tasks; ++task)
                if (chore++ % params->nth == params->ith)
                    quantize_row(quantized_thought_ + token * tasks * ldq + task * ldq,
                                 (const float*)((const char*)thought->data +
                                                token * thought->nb[2] + task * thought->nb[1]),
                                 vec_dot_type);
    }

    // Converts one row of f32 activations to the kernel's vec-dot type.
    // Only the types reachable from mixmuler() are handled.
    void quantize_row(void* dst, const float* src, ggml_type type) {
        assert((long)ggml_row_size(type, cols) <= ldq);
        switch (type) {
            case GGML_TYPE_F16:
                ggml_fp32_to_fp16_row(src, (ggml_fp16_t*)dst, cols);
                break;
            case GGML_TYPE_BF16:
                ggml_fp32_to_bf16_row(src, (ggml_bf16_t*)dst, cols);
                break;
            case GGML_TYPE_Q8_0:
                quantize_row_q8_0((const float*)src, (block_q8_0*)dst, cols);
                break;
            default:
                GGML_UNREACHABLE();
        }
    }

    // Bump allocator over wdata_: rounds the cursor up to `align`, reserves
    // sizeof(T)*elems bytes, and returns nullptr if that would exceed
    // params->wsize (the `toto >= allocated_` test also guards against
    // arithmetic wraparound of the cursor).
    template <typename T>
    T* allocate(size_t align, size_t elems) {
        T* res = nullptr;
        size_t need = sizeof(T) * elems;
        size_t base = allocated_;
        base += align - 1;
        base &= -align;
        size_t toto = base + need;
        if (toto >= allocated_ && toto <= params->wsize) {
            res = (T*)(wdata_ + base);
            allocated_ = toto;
        }
        return res;
    }

    // operands (borrowed; not owned)
    const ggml_compute_params* const params;
    const ggml_tensor* const weights;
    const ggml_tensor* const thought;
    const ggml_tensor* const plan;
    ggml_tensor* const result;
    // geometry captured from the tensors at construction
    const long rows;
    const long cols;
    const int experts;
    const int thinkers;
    const int tasks;
    const long tokens;
    const long ldq;

    // variables
    char* const wdata_;
    size_t allocated_;

    // shared memory
    long* rowptr_count_ /*[experts]*/;
    char* quantized_thought_ /*[tokens][tasks][cols][2]*/;
    uintptr_t* rowptr_result_ /*[experts][tokens*thinkers]*/;
    uintptr_t* rowptr_thought_ /*[experts][tokens*thinkers]*/;
};

}  // namespace
|
||||
|
||||
/**
|
||||
* Performs "mixture of experts" tensor multiplication on CPU.
|
||||
*/
|
||||
bool llamafile_mixmul(const ggml_compute_params* params, const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan, ggml_tensor* result) {
|
||||
MixMul mm{params, weights, thought, plan, result};
|
||||
return mm.allocate_shared_memory() && mm.mixmul();
|
||||
}
|
||||
24
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
vendored
Normal file
24
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_avx
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
|
||||
/**
|
||||
* Returns number of shared memory bytes llamafile_mixmul() needs.
|
||||
*/
|
||||
/**
 * Returns number of shared memory bytes llamafile_mixmul() needs.
 *
 * Uses a dummy ggml_compute_params with a huge wsize and a fake aligned
 * wdata base so MixMul's bump allocator can run a dry pass; returns 0 if
 * even that oversized buffer cannot satisfy the request.
 *
 * Fix: restores `&params` — the source had been mangled into `¶ms`
 * (the "&para" prefix was swallowed as an HTML entity), which does not
 * compile.
 */
size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
    ggml_compute_params params{};
    params.wsize = 0x7ffff000;
    params.wdata = (void*)0x1000;
    // result tensor is not needed for sizing, so pass null
    MixMul mm{&params, weights, thought, plan, 0};
    if (mm.allocate_shared_memory())
        return mm.get_allocated_bytes();
    else
        return 0;
}
|
||||
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_avx2
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_avx512f
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_avxvnni
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_fma
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_mixmul llamafile_mixmul_amd_zen4
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __x86_64__
|
||||
24
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp
vendored
Normal file
24
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp
vendored
Normal file
@@ -0,0 +1,24 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm80.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#ifdef __aarch64__
|
||||
#define llamafile_mixmul llamafile_mixmul_arm80
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
|
||||
/**
|
||||
* Returns number of shared memory bytes llamafile_mixmul() needs.
|
||||
*/
|
||||
/**
 * Returns number of shared memory bytes llamafile_mixmul() needs.
 *
 * Runs MixMul's bump allocator in a dry pass against a dummy
 * ggml_compute_params (huge wsize, fake aligned wdata base) and reports
 * how many bytes it consumed; returns 0 on failure.
 *
 * Fix: restores `&params` — the source had been mangled into `¶ms`
 * (the "&para" prefix was swallowed as an HTML entity), which does not
 * compile.
 */
size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
    ggml_compute_params params{};
    params.wsize = 0x7ffff000;
    params.wdata = (void*)0x1000;
    // result tensor is not needed for sizing, so pass null
    MixMul mm{&params, weights, thought, plan, 0};
    if (mm.allocate_shared_memory())
        return mm.get_allocated_bytes();
    else
        return 0;
}
|
||||
|
||||
#endif // __aarch64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_arm82.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_mixmul_arm82.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm82.cpp
|
||||
// Copyrigth 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#ifdef __aarch64__
|
||||
#define llamafile_mixmul llamafile_mixmul_arm82
|
||||
#include "tinyblas_cpu_mixmul.inc"
|
||||
#endif // __aarch64__
|
||||
361
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm.inc
vendored
Normal file
361
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm.inc
vendored
Normal file
@@ -0,0 +1,361 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm.inc
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
||||
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||
//
|
||||
// Copyright 2024 Mozilla Foundation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tinyblas_cpu.h"
|
||||
|
||||
//
|
||||
//
|
||||
// ██████╗ ██╗ █████╗ ██████╗
|
||||
// ██████╗██╗██╗ ██╗██═██╗██╔══██╗██║ ██╔══██╗██╔═══╝
|
||||
// ╚═██╔═╝██║███▄██║██ ██║██████╔╝██║ ███████║██████╗
|
||||
// ██║ ██║██▀███║╚███╔╝██╔══██╗██║ ██╔══██║╔═══██║
|
||||
// ██║ ██║██║ ██║ ███║ ██████╔╝████╗██║ ██║██████║
|
||||
// ╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝ ╚═════╝ ╚═══╝╚═╝ ╚═╝╚═════╝
|
||||
//
|
||||
// BASIC LINEAR ALGEBRA SUBPROGRAMS
|
||||
//
|
||||
//
|
||||
// This file implements multithreaded CPU matrix multiplication for the
|
||||
// common contiguous use case C = Aᵀ * B. These kernels are designed to
|
||||
// have excellent performance[1] for matrices that fit in the CPU cache
|
||||
// without imposing any overhead such as cache filling or malloc calls.
|
||||
//
|
||||
// This implementation does not guarantee any upper bound with rounding
|
||||
// errors, which grow along with k. Our goal's to maximally exploit the
|
||||
// hardware for performance, and then use whatever resources remain for
|
||||
// improving numerical accuracy.
|
||||
//
|
||||
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
|
||||
// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
|
||||
|
||||
namespace {

// Dispatches C = Aᵀ * B onto a tinyBLAS kernel chosen by the A/B element
// types (Atype/Btype) and the ISA selected at compile time. Return values:
// true on success, NOT_SUPPORTED when no kernel matches (caller falls back),
// WANT_QUANTIZATION when the caller should first quantize B from f32 into
// the kernel's vec-dot type and call again. `task` is forwarded to
// tinyBLAS::matmul; `precision` selects the f32-accumulate fp16 path on ARM.
// Ctype is accepted for signature parity but unused here (C's element type
// comes from the TC template parameter).
template <typename TC>
bool llamafile_sgemm_impl(long m, long n, long k, const void* A, long lda, const void* B, long ldb, TC* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    switch (Atype) {
        case GGML_TYPE_F32: {
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
#if defined(__AVX512F__)
            // k must be a multiple of the SIMD width for each variant below
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX__) || defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON)
            if (k % 4)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, float, float, TC> tb{
                k, (const float*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
            if (k % 32)
                return NOT_SUPPORTED;
            // matvec (n < 2): mixed bf16*f32 kernel avoids quantizing B
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_BF16)
                return NOT_SUPPORTED;
            // FLAG_precise selects f32 accumulation over __m512bh dot products
            if (!FLAG_precise) {
                tinyBLAS<0, 32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, TC> tb{
                    k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__AVX2__)
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (k % 4)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_bf16_t, float, TC> tb{
                k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_F16: {
#if defined(__AVX512F__)
            if (k % 16)
                return NOT_SUPPORTED;
            // matvec (n < 2): mixed fp16*f32 kernel avoids quantizing B
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
            // NOTE(review): upstream gated this on a runtime X86_CHECK(F16C);
            // here the check is compile-time only (see commented code).
            // if (X86_CHECK(F16C)) {
            if (k % 8)
                return NOT_SUPPORTED;
            if (Btype == GGML_TYPE_F32 && n < 2) {
                tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_F16)
                return NOT_SUPPORTED;
            tinyBLAS<0, 8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
            // } else {
            //     return NOT_SUPPORTED;
            // }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise)
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                return NOT_SUPPORTED;
            if (precision == GGML_PREC_F32) {
                // f32-accumulate variant: fp16 weights, f32 activations
                if (k % 4)
                    return NOT_SUPPORTED;
                if (Btype != GGML_TYPE_F32)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            } else {
                if (k % 8)
                    return NOT_SUPPORTED;
                if (Btype == GGML_TYPE_F32)
                    return WANT_QUANTIZATION;
                if (Btype != GGML_TYPE_F16)
                    return NOT_SUPPORTED;
                tinyBLAS<0, 8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, TC> tb{
                    k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, C, ldc, ith, nth};
                tb.matmul(m, n, task);
                return true;
            }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
            if (n < 2 && !FLAG_precise)
                // TODO(jart): Why is ggml_vec_dot_f16_unroll() so fast at matvec?
                return NOT_SUPPORTED;
            if (k % 4)
                return NOT_SUPPORTED;
            if (Btype != GGML_TYPE_F32)
                return NOT_SUPPORTED;
            tinyBLAS<0, 4, float32x4_t, float32x4_t, ggml_fp16_t, float, TC> tb{
                k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q8_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q8_0, block_q8_0, TC> tb{
                k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        case GGML_TYPE_Q4_0: {
            if (Btype == GGML_TYPE_F32)
                return WANT_QUANTIZATION;
            if (Btype != GGML_TYPE_Q8_0)
                return NOT_SUPPORTED;
#if defined(__AVX2__) || defined(__AVX512F__)
            tinyBLAS_Q0_AVX2<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#elif defined(__ARM_FEATURE_DOTPROD)
            tinyBLAS_Q0_ARM<0, block_q4_0, block_q8_0, TC> tb{
                k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, C, ldc, ith, nth};
            tb.matmul(m, n, task);
            return true;
#else
            return NOT_SUPPORTED;
#endif
        }

        default:
            return NOT_SUPPORTED;
    }

    // Unreachable after the exhaustive switch above; these casts exist only
    // to suppress unused-parameter warnings on configurations where some
    // branches are preprocessed away.
    (void)m;
    (void)n;
    (void)k;
    (void)A;
    (void)lda;
    (void)B;
    (void)ldb;
    (void)C;
    (void)ldc;
    (void)ith;
    (void)nth;
    (void)Atype;
    (void)Btype;
    (void)precision;
}

}  // namespace
|
||||
|
||||
/**
|
||||
* Performs optimized matrix multiplication on CPU.
|
||||
*
|
||||
* This subroutine may compute C = Aᵀ * B with column major ordering.
|
||||
* Despite its name, this isn't a generalized implementation. Work is
|
||||
* only performed when a handwritten kernel is written and available.
|
||||
* Otherwise the caller should fall back to a general matmul routine.
|
||||
*
|
||||
* For example, for single-threaded single-precision GEMM you can say
|
||||
*
|
||||
* llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1,
|
||||
* GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32,
|
||||
* GGML_PREC_DEFAULT);
|
||||
*
|
||||
* @param m is rows in `A` and `C`
|
||||
* @param n is cols in `B` and `C`
|
||||
* @param k is cols in `A` and rows in `B`
|
||||
* @param A is first input matrix (always transposed)
|
||||
* @param lda is row stride of `A`
|
||||
* @param B is second input matrix (never transposed)
|
||||
* @param ldb is row stride of `B`
|
||||
* @param C is input/output array of output matrices
|
||||
* @param ldc is row stride of `C`
|
||||
* @param ith is thread id (must be less than `nth`)
|
||||
* @param nth is number of threads (must be greater than zero)
|
||||
* @param Atype is GGML data type of `A`
|
||||
* @param Btype is GGML data type of `B`
|
||||
* @param Ctype is GGML data type of `C`
|
||||
* @param precision may be used to control the internal compute type
|
||||
* @return true if this function was able to service the matmul request
|
||||
*/
|
||||
bool llamafile_sgemm(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    // Sanity-check dimensions, row strides, and the thread id / thread count
    // supplied by the caller before dispatching to any kernel.
    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);

#if QK_K == 256
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__AVX2__) && (defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))))
    /* moonll: accept more Btype combinations by trying the iqk fast path first */

    // Fast path: hand the request to iqk_mul_mat whenever the output is F32.
    // On success we are done; otherwise fall through to the generic dispatch
    // at the bottom of this function.
    if (Ctype == GGML_TYPE_F32){
        if (iqk_mul_mat(m, n, k * ggml_blck_size(ggml_type(Atype)), Atype, A,lda,Btype, B,ldb, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }

#endif
#elif defined __aarch64__ && defined __ARM_FEATURE_DOTPROD && !defined _MSC_VER
    // NOTE(review): these aarch64 calls pass fewer arguments than the x86
    // iqk_mul_mat call above (no lda/Btype/ldb) — confirm they match the
    // iqk_mul_mat prototype actually compiled for aarch64 builds.
    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
        if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
    if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
        // assert(QK8_0 == QK8_1 == QK4_0 == QK4_1 == QK5_0 == QK5_1 == 32);
        // All of these block formats pack 32 elements per block, so k*QK8_0
        // is the element count regardless of which of them B actually uses.
        assert((QK8_0 == 32) && (QK8_1 == 32) && (QK4_0 == 32) && (QK4_1 == 32) && (QK5_0 == 32) && (QK5_1 == 32));
        if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float*)C, ldc, ith, nth)) {
            return true;
        }
    }
#endif
#endif

    // Generic path: only F32 output has a tinyBLAS implementation; any other
    // Ctype is reported as unsupported so the caller can fall back.
    switch (Ctype) {
        case GGML_TYPE_F32:
            return llamafile_sgemm_impl(m, n, k, A, lda, B, ldb, (float*)C, ldc, ith, nth, task, Atype,
                                        Btype, Ctype, precision);
        default:
            return NOT_SUPPORTED;
    }
}
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_avx
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_avx2
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_avx512f
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_avxvnni
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_fma
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
10
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
vendored
Normal file
10
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define llamafile_sgemm llamafile_sgemm_amd_zen4
|
||||
#define iqk_mul_mat iqk_mul_mat_zen4
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __x86_64__
|
||||
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_arm80.cpp
vendored
Normal file
9
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_arm80.cpp
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
// // Adapted from
|
||||
// // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm80.cpp
|
||||
// // Copyright 2024 Mozilla Foundation.
|
||||
// // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// #ifdef __aarch64__
|
||||
// #define llamafile_sgemm llamafile_sgemm_arm80
|
||||
// #include "tinyblas_cpu_sgemm.inc"
|
||||
// #endif // __aarch64__
|
||||
10
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_arm82.cpp
vendored
Normal file
10
kt-kernel/third_party/llamafile/tinyblas_cpu_sgemm_arm82.cpp
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm82.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
#ifdef __aarch64__
|
||||
#define llamafile_sgemm llamafile_sgemm_arm82
|
||||
#define iqk_mul_mat iqk_mul_mat_arm82
|
||||
#include "tinyblas_cpu_sgemm.inc"
|
||||
#endif // __aarch64__
|
||||
39
kt-kernel/third_party/llamafile/tinyblas_cpu_unsupported.cpp
vendored
Normal file
39
kt-kernel/third_party/llamafile/tinyblas_cpu_unsupported.cpp
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
// Adapted from
|
||||
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_unsupported.cpp
|
||||
// Copyright 2024 Mozilla Foundation.
|
||||
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
|
||||
|
||||
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
||||
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||
//
|
||||
// Copyright 2024 Mozilla Foundation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "sgemm.h"
|
||||
|
||||
// Fallback for builds targeting a CPU with none of the SIMD feature sets this
// library provides kernels for. It never services a request: returning false
// tells the caller to use a generic matmul implementation instead.
bool llamafile_sgemm_unsupported(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {
    // Explicitly discard every parameter — mirrors the (void) idiom used by
    // the real implementation and keeps -Wunused-parameter builds quiet.
    (void)m;
    (void)n;
    (void)k;
    (void)A;
    (void)lda;
    (void)B;
    (void)ldb;
    (void)C;
    (void)ldc;
    (void)ith;
    (void)nth;
    (void)task;
    (void)Atype;
    (void)Btype;
    (void)Ctype;
    (void)precision;
    return false;
}
|
||||
|
||||
// Fallback mixture-of-experts matmul for unsupported CPUs: never services the
// request, so the caller falls back to the generic ggml implementation.
bool llamafile_mixmul_unsupported(const struct ggml_compute_params* params,
                                  const struct ggml_tensor* weights,
                                  const struct ggml_tensor* thought,
                                  const struct ggml_tensor* plan,
                                  struct ggml_tensor* result) {
    // All inputs are intentionally ignored; discard them explicitly to match
    // the (void) idiom used elsewhere in this library.
    (void)params;
    (void)weights;
    (void)thought;
    (void)plan;
    (void)result;
    return false;
}
|
||||
|
||||
// Stub counterpart of the iqk MoE matmul for CPUs without supported SIMD
// kernels. Parameters are deliberately unnamed (valid in C++, and it avoids
// unused-parameter warnings); it always returns false so callers take a
// generic fallback path.
bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int) {
    return false;
}
|
||||
Reference in New Issue
Block a user