mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-05-19 20:09:19 +00:00
V4-Flash MXFP4 full-GPU prefill fallback now works:
- Previously it crashed all TP schedulers with StopIteration/AttributeError whenever --kt-gpu-prefill-token-threshold was low enough to actually fire (the path was hardcoded for FP8/INT4 layouts).
- It now detects MXFP4, re-runs the V4 swizzle on the 256-expert gpu_layer, and caches the load across prefill chunks.
- Measured on 8x RTX 5090 (threshold=1024, chunked=1024): 16k input -> 2011 tok/s, 65k -> 2798, 262k -> 2154 prefill TPS.
30 lines
689 B
Python
30 lines
689 B
Python
"""Lightweight top-level package: pip install ktransformers -> installs kt-kernel.
|
|
|
|
Extras:
- ktransformers[sft] installs transformers-kt + accelerate-kt
- ktransformers[sglang] installs sglang-kt
"""
|
|
from pathlib import Path

from setuptools import setup

# The version is obtained by executing version.py (located next to this file)
# in a scratch namespace instead of importing it as a module.
_version_file = Path(__file__).resolve().parent / "version.py"
_ns = {}
# Decode explicitly as UTF-8, Python's default source encoding, so the build
# does not depend on the locale of the machine running it.
exec(_version_file.read_text(encoding="utf-8"), _ns)
_v = _ns["__version__"]  # KeyError here means version.py did not set __version__

# NOTE(review): no name= or other metadata is passed here; presumably it is
# supplied by pyproject.toml / setup.cfg -- confirm.
setup(
    version=_v,
    # kt-kernel is pinned to exactly the same version as this package.
    install_requires=[
        f"kt-kernel=={_v}",
    ],
    # Optional dependency groups, selected as ktransformers[sft] / [sglang].
    extras_require={
        "sft": [
            "transformers-kt==5.6.0.post1",
            "accelerate-kt==1.14.0.post1",
        ],
        "sglang": [
            "sglang-kt==0.6.2.post1",
        ],
    },
)