Merge branch 'main' into draft-split

2026-03-14 15:57:27 +00:00 · 2025-02-08 15:10:44 -05:00
parent ab1f4b7a6a dcbf2de9e5
commit bd8256d168
14 changed files with 287 additions and 226 deletions
--- a/backends/exllamav2/grammar.py
+++ b/backends/exllamav2/grammar.py
@@ -1,110 +1,16 @@
 import traceback
-from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer
-from exllamav2.generator.filters import ExLlamaV2Filter, ExLlamaV2PrefixFilter
-from lmformatenforcer import (
-    JsonSchemaParser,
-    RegexParser,
-    TokenEnforcer,
-    CharacterLevelParser,
-)
-from lmformatenforcer.integrations.exllamav2 import (
-    build_token_enforcer_tokenizer_data,
-)
-from loguru import logger
-from typing import List
+import typing
 from functools import lru_cache
+from typing import List

-
-class OutlinesTokenizerWrapper:
-    """Wrapper for Outlines tokenizer"""
-
-    def __init__(self, tokenizer):
-        self.tokenizer = tokenizer
-        id_to_piece = self.tokenizer.get_id_to_piece_list()
-        self.vocabulary = {piece: idx for idx, piece in enumerate(id_to_piece)}
-        self.eos_token_id = self.tokenizer.eos_token_id
-        self.eos_token = id_to_piece[self.tokenizer.eos_token_id]
-        self.special_tokens = list(self.tokenizer.extended_id_to_piece.keys())
-
-    def convert_token_to_string(self, token):
-        return token
-
-    def decode(self, tokens):
-        s = ""
-        id_to_piece = self.tokenizer.get_id_to_piece_list()
-        for t in tokens:
-            s += id_to_piece[t]
-        return s
-
-
-class ExLlamaV2EbnfFilter(ExLlamaV2Filter):
-    """Filter class for context-free grammar via outlines"""
-
-    def __init__(self, model, tokenizer, grammar):
-        from outlines.fsm.fsm import CFGFSM
-
-        super().__init__(model, tokenizer)
-
-        self.wrapped_tokenizer = OutlinesTokenizerWrapper(tokenizer)
-        self.fsm = CFGFSM(grammar, self.wrapped_tokenizer)
-        self.state = self.fsm.first_state
-
-    def begin(self, prefix_str=""):
-        self.state = self.fsm.first_state
-
-    def feed(self, token):
-        self.state = self.fsm.next_state(self.state, token.item())
-
-    def next(self):
-        return self.fsm.allowed_token_ids(self.state), set()
-
-    def use_background_worker(self):
-        return True
-
-
-@lru_cache(10)
-def _get_lmfe_tokenizer_data(tokenizer: ExLlamaV2Tokenizer):
-    return build_token_enforcer_tokenizer_data(tokenizer)
-
-
-class ExLlamaV2TokenEnforcerFilter(ExLlamaV2Filter):
-    """Filter class for LMFE"""
-
-    token_sequence: List[int]
-
-    def __init__(
-        self,
-        model: ExLlamaV2,
-        tokenizer: ExLlamaV2Tokenizer,
-        character_level_parser: CharacterLevelParser,
-    ):
-        super().__init__(model, tokenizer)
-        tokenizer_data = _get_lmfe_tokenizer_data(tokenizer)
-        self.token_enforcer = TokenEnforcer(tokenizer_data, character_level_parser)
-        self.token_sequence = []
-
-    def begin(self, prefix_str: str):
-        self.token_sequence = []
-
-    def feed(self, token):
-        self.token_sequence.append(int(token[0][0]))
-
-    def next(self):
-        allowed_tokens = self.token_enforcer.get_allowed_tokens(self.token_sequence)
-        if not hasattr(self, "allow_return_type_list"):
-            return set(allowed_tokens), set()
-        else:
-            return sorted(allowed_tokens), []
-
-    def use_background_worker(self):
-        return True
-
-
-def clear_grammar_func_cache():
-    """Flush tokenizer_data cache to avoid holding references to
-    tokenizers after unloading a model"""
-
-    _get_lmfe_tokenizer_data.cache_clear()
+import torch
+from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer
+from exllamav2.generator.filters import ExLlamaV2Filter
+from formatron.extractor import NonterminalExtractor
+from formatron.formatter import FormatterBuilder
+from formatron.integrations.exllamav2 import FormatterFilter, create_engine_vocabulary
+from formatron.schemas import json_schema
+from loguru import logger


 class ExLlamaV2Grammar:
@@ -117,7 +23,7 @@ class ExLlamaV2Grammar:

    def add_json_schema_filter(
        self,
-        json_schema: dict,
+        schema: dict,
        model: ExLlamaV2,
        tokenizer: ExLlamaV2Tokenizer,
    ):
@@ -125,7 +31,16 @@ class ExLlamaV2Grammar:

        # Create the parser
        try:
-            schema_parser = JsonSchemaParser(json_schema)
+            # Add fields required by formatron if not present
+            if "$id" not in schema:
+                schema["$id"] = "https://example.com/example.json"
+            if "$schema" not in schema:
+                schema["$schema"] = "http://json-schema.org/draft-07/schema#"
+
+            # Validate schema and create formatter
+            schema = json_schema.create_schema(schema)
+            f = FormatterBuilder()
+            f.append_line(f"{f.json(schema)}")
        except Exception:
            traceback.print_exc()
            logger.error(
@@ -135,14 +50,10 @@ class ExLlamaV2Grammar:

            return

-        # Allow JSON objects or JSON arrays at the top level
-        json_prefixes = ["[", "{"]
-
-        lmfilter = ExLlamaV2TokenEnforcerFilter(model, tokenizer, schema_parser)
-        prefix_filter = ExLlamaV2PrefixFilter(model, tokenizer, json_prefixes)
+        lmfilter = _create_formatter_filter(model, tokenizer, f)

        # Append the filters
-        self.filters.extend([lmfilter, prefix_filter])
+        self.filters.append(lmfilter)

    def add_regex_filter(
        self,
@@ -154,7 +65,9 @@ class ExLlamaV2Grammar:

        # Create the parser
        try:
-            pattern_parser = RegexParser(pattern)
+            # Validate regex and create formatter
+            f = FormatterBuilder()
+            f.append_line(f"{f.regex(pattern)}")
        except Exception:
            traceback.print_exc()
            logger.error(
@@ -164,32 +77,82 @@ class ExLlamaV2Grammar:

            return

-        lmfilter = ExLlamaV2TokenEnforcerFilter(model, tokenizer, pattern_parser)
+        lmfilter = _create_formatter_filter(model, tokenizer, f)

        # Append the filters
        self.filters.append(lmfilter)

-    def add_ebnf_filter(
+    def add_kbnf_filter(
        self,
-        ebnf_string: str,
+        kbnf_string: str,
        model: ExLlamaV2,
        tokenizer: ExLlamaV2Tokenizer,
    ):
-        """
-        Add an EBNF grammar filter.
-        Possibly replace outlines with an in-house solution in the future.
-        """
+        """Adds an ExllamaV2 filter based on KBNF grammar."""

+        # Create the parser
        try:
-            ebnf_filter = ExLlamaV2EbnfFilter(model, tokenizer, ebnf_string)
-        except ImportError:
+            # Validate KBNF and create formatter
+            f = FormatterBuilder()
+            f.append_line(
+                f"""{f.extractor(lambda nonterminal:
+                    CFGExtractor(nonterminal, kbnf_string))}"""
+            )
+        except Exception:
            logger.error(
-                "Skipping EBNF parsing because Outlines is not installed.\n"
-                "Please run the following command in your environment "
-                "to install extra packages:\n"
-                "pip install -U .[extras]"
+                "Skipping because the KBNF string couldn't be parsed. "
+                "Please read the above error for more information."
            )

            return

-        self.filters.append(ebnf_filter)
+        lmfilter = _create_formatter_filter(model, tokenizer, f)
+
+        # Append the filters
+        self.filters.append(lmfilter)
+
+
+class CFGExtractor(NonterminalExtractor):
+    """Extractor class for KBNF context-free grammar"""
+
+    def __init__(self, nonterminal: str, kbnf_string: str):
+        super().__init__(nonterminal)
+        self.kbnf_string = kbnf_string
+
+    # Return the entire input string as the extracted string
+    def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]:
+        return "", input_str
+
+    @property
+    def kbnf_definition(self) -> str:
+        return self.kbnf_string.replace("start", self.nonterminal)
+
+
+@lru_cache(1)
+def _create_cached_engine_vocabulary(tokenizer: ExLlamaV2Tokenizer):
+    """Build and cache engine vocabulary on first grammar run"""
+
+    return create_engine_vocabulary(tokenizer)
+
+
+def _create_formatter_filter(
+    model: ExLlamaV2, tokenizer: ExLlamaV2Tokenizer, formatter_builder: FormatterBuilder
+) -> ExLlamaV2Filter:
+    """
+    Create a formatter filter for the ExLlamaV2 engine.
+    Minimalist clone of formatron.integrations.exllamav2.create_formatter_filter
+    with lru_cache enabled for engine vocabulary
+    """
+
+    vocab = _create_cached_engine_vocabulary(tokenizer)
+    f = formatter_builder.build(
+        vocab, lambda tokens: tokenizer.decode(torch.tensor(tokens))
+    )
+    return FormatterFilter(model, tokenizer, f)
+
+
+def clear_grammar_func_cache():
+    """Flush the engine vocabulary cache to avoid holding references to
+    tokenizers after unloading a model"""
+
+    _create_cached_engine_vocabulary.cache_clear()
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -498,16 +498,18 @@ class ExllamaV2Container:
            "rope_scale": self.config.scale_pos_emb,
            "rope_alpha": self.config.scale_alpha_value,
            "max_seq_len": self.config.max_seq_len,
+            "max_batch_size": self.max_batch_size,
            "cache_size": self.cache_size,
            "cache_mode": self.cache_mode,
            "chunk_size": self.config.max_input_len,
            "num_experts_per_token": self.config.num_experts_per_token,
-            "prompt_template": self.prompt_template.name
-            if self.prompt_template
-            else None,
            "use_vision": self.use_vision,
        }

+        if self.prompt_template:
+            model_params["prompt_template"] = self.prompt_template.name
+            model_params["prompt_template_content"] = self.prompt_template.raw_template
+
        if self.draft_config:
            draft_model_params = {
                "name": self.draft_model_dir.name,
@@ -787,6 +789,10 @@ class ExllamaV2Container:
                max_batch_size=self.max_batch_size,
                paged=self.paged,
            )
+
+            # Update the state of the container var
+            if self.max_batch_size is None:
+                self.max_batch_size = self.generator.generator.max_batch_size
        finally:
            # This means the generator is being recreated
            # The load lock is already released in the load function
@@ -1222,7 +1228,7 @@ class ExllamaV2Container:
        # Add EBNF filter if it exists
        grammar_string = unwrap(kwargs.get("grammar_string"))
        if grammar_string:
-            grammar_handler.add_ebnf_filter(grammar_string, self.model, self.tokenizer)
+            grammar_handler.add_kbnf_filter(grammar_string, self.model, self.tokenizer)

        # Set banned strings
        banned_strings: List[str] = unwrap(kwargs.get("banned_strings"), [])
@@ -1329,17 +1335,49 @@ class ExllamaV2Container:

        # The first index will always be the positive prompt
        context_len = input_ids[0].size(dim=-1)
-        if context_len > self.config.max_seq_len:
-            raise ValueError(
-                f"Context length {context_len} is greater than max_seq_len "
-                f"{self.config.max_seq_len}"
-            )
+
+        # The second index will be the negative prompt if CFG is enabled
+        negative_context_len = input_ids[1].size(dim=-1) if negative_prompt else 0

        # Automatically set max_tokens to fill up the context
        # This should be an OK default, but may be changed in the future
        max_tokens = unwrap(
-            kwargs.get("max_tokens"), self.config.max_seq_len - context_len
+            kwargs.get("max_tokens"),
+            self.config.max_seq_len - max(context_len, negative_context_len),
        )
+        if max_tokens < 1:
+            logger.warning("max_tokens must be a positive integer, setting to 1.")
+            max_tokens = 1
+
+        # Determine if the negative context or the context length is bigger
+        context_to_check = max(negative_context_len, context_len)
+
+        # Check highest possible total length of request
+        if context_to_check + max_tokens > self.config.max_seq_len:
+            preamble = (
+                "Negative prompt request"
+                if negative_context_len > context_len
+                else "Request"
+            )
+
+            raise ValueError(
+                f"{preamble} length {context_to_check} + {max_tokens} is greater than "
+                f"max_seq_len {self.config.max_seq_len}"
+            )
+
+        # Check total required pages for CFG request to avoid overallocation
+        if negative_prompt and (
+            sum(
+                256 * math.ceil((context + max_tokens) / 256)
+                for context in (context_len, negative_context_len)
+            )
+            > self.cache_size
+        ):
+            raise ValueError(
+                f"Total required page size for request "
+                f"{context_len} + {negative_context_len} + {max_tokens} * 2 "
+                f"is greater than cache_size {self.cache_size}"
+            )

        # Set min_tokens to generate while keeping EOS banned
        min_tokens = unwrap(kwargs.get("min_tokens"), 0)
--- a/common/logger.py
+++ b/common/logger.py
@@ -52,6 +52,10 @@ def _log_formatter(record: dict):
        "ERROR": "red",
        "CRITICAL": "bold white on red",
    }
+
+    time = record.get("time")
+    colored_time = f"[grey37]{time:YYYY-DD-MM HH:mm:ss.SSS}[/grey37]"
+
    level = record.get("level")
    level_color = color_map.get(level.name, "cyan")
    colored_level = f"[{level_color}]{level.name}[/{level_color}]:"
@@ -69,9 +73,11 @@ def _log_formatter(record: dict):

    fmt = ""
    if len(lines) > 1:
-        fmt = "\n".join([f"{colored_level}{separator}{line}" for line in lines])
+        fmt = "\n".join(
+            [f"{colored_time} {colored_level}{separator}{line}" for line in lines]
+        )
    else:
-        fmt = f"{colored_level}{separator}{message}"
+        fmt = f"{colored_time} {colored_level}{separator}{message}"

    return fmt

--- a/common/multimodal.py
+++ b/common/multimodal.py
@@ -1,7 +1,8 @@
-from typing import List
 from backends.exllamav2.vision import get_image_embedding
 from common import model
 from loguru import logger
+from pydantic import BaseModel, Field
+from typing import List

 from common.optional_dependencies import dependencies

@@ -9,12 +10,12 @@ if dependencies.exllamav2:
    from exllamav2 import ExLlamaV2VisionTower


-class MultimodalEmbeddingWrapper:
+class MultimodalEmbeddingWrapper(BaseModel):
    """Common multimodal embedding wrapper"""

    type: str = None
-    content: List = []
-    text_alias: List[str] = []
+    content: list = Field(default_factory=list)
+    text_alias: List[str] = Field(default_factory=list)

    async def add(self, url: str):
        # Determine the type of vision embedding to use
--- a/common/optional_dependencies.py
+++ b/common/optional_dependencies.py
@@ -14,14 +14,13 @@ class DependenciesModel(BaseModel):
    torch: bool
    exllamav2: bool
    flash_attn: bool
-    outlines: bool
    infinity_emb: bool
    sentence_transformers: bool

    @computed_field
    @property
    def extras(self) -> bool:
-        return self.outlines and self.infinity_emb and self.sentence_transformers
+        return self.infinity_emb and self.sentence_transformers

    @computed_field
    @property
--- a/common/sampling.py
+++ b/common/sampling.py
@@ -25,7 +25,9 @@ class BaseSamplerRequest(BaseModel):

    max_tokens: Optional[int] = Field(
        default_factory=lambda: get_default_sampler_value("max_tokens"),
-        validation_alias=AliasChoices("max_tokens", "max_length"),
+        validation_alias=AliasChoices(
+            "max_tokens", "max_completion_tokens", "max_length"
+        ),
        description="Aliases: max_length",
        examples=[150],
        ge=0,
--- a/endpoints/Kobold/utils/generation.py
+++ b/endpoints/Kobold/utils/generation.py
@@ -2,7 +2,7 @@ import asyncio
 from asyncio import CancelledError
 from fastapi import HTTPException, Request
 from loguru import logger
-from sse_starlette import ServerSentEvent
+from sse_starlette.event import ServerSentEvent

 from common import model
 from common.networking import (
--- a/endpoints/OAI/types/embedding.py
+++ b/endpoints/OAI/types/embedding.py
@@ -27,7 +27,7 @@ class EmbeddingsRequest(BaseModel):

 class EmbeddingObject(BaseModel):
    object: str = Field("embedding", description="Type of the object.")
-    embedding: List[float] = Field(
+    embedding: Union[List[float], str] = Field(
        ..., description="Embedding values as a list of floats."
    )
    index: int = Field(
--- a/endpoints/OAI/utils/chat_completion.py
+++ b/endpoints/OAI/utils/chat_completion.py
@@ -454,16 +454,23 @@ async def generate_tool_calls(
        if gen["stop_str"] in tool_data.tool_call_start:
            if "text" in gen:
                # non streaming, all generations will have the text they generated
-                pre_tool_prompt = await apply_chat_template(data, gen["text"])
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, gen["text"]
+                )
            elif current_generations is not None:
                # streaming, we wont have text in the generation,
                # we'll have to use the current_generations
-                pre_tool_prompt = await apply_chat_template(data, current_generations)
+                pre_tool_prompt, mm_embeddings = await apply_chat_template(
+                    data, current_generations
+                )

            gen_tasks.append(
                asyncio.create_task(
                    model.container.generate(
-                        pre_tool_prompt, request.state.id, **gen_params
+                        pre_tool_prompt,
+                        request.state.id,
+                        embeddings=mm_embeddings,
+                        **gen_params,
                    )
                )
            )
--- a/endpoints/core/router.py
+++ b/endpoints/core/router.py
@@ -4,6 +4,7 @@ from sys import maxsize
 from typing import Optional
 from common.multimodal import MultimodalEmbeddingWrapper
 from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from fastapi.responses import JSONResponse
 from sse_starlette import EventSourceResponse

 from common import model, sampling
@@ -22,9 +23,11 @@ from endpoints.core.types.lora import LoraList, LoraLoadRequest, LoraLoadRespons
 from endpoints.core.types.model import (
    EmbeddingModelLoadRequest,
    ModelCard,
+    ModelDefaultGenerationSettings,
    ModelList,
    ModelLoadRequest,
    ModelLoadResponse,
+    ModelPropsResponse,
 )
 from endpoints.core.types.health import HealthCheckResponse
 from endpoints.core.types.sampler_overrides import (
@@ -65,6 +68,34 @@ async def healthcheck(response: Response) -> HealthCheckResponse:
    )


+@router.get("/.well-known/serviceinfo")
+async def service_info():
+    return JSONResponse(
+        content={
+            "version": 0.1,
+            "software": {
+                "name": "TabbyAPI",
+                "repository": "https://github.com/theroyallab/tabbyAPI",
+                "homepage": "https://github.com/theroyallab/tabbyAPI",
+            },
+            "api": {
+                "openai": {
+                    "name": "OpenAI API",
+                    "relative_url": "/v1",
+                    "documentation": "https://theroyallab.github.io/tabbyAPI",
+                    "version": 1,
+                },
+                "koboldai": {
+                    "name": "KoboldAI API",
+                    "relative_url": "/api",
+                    "documentation": "https://theroyallab.github.io/tabbyAPI",
+                    "version": 1,
+                },
+            },
+        }
+    )
+
+
 # Model list endpoint
@router.get("/v1/models", dependencies=[Depends(check_api_key)])
@router.get("/v1/model/list", dependencies=[Depends(check_api_key)])
@@ -102,6 +133,30 @@ async def current_model() -> ModelCard:
    return get_current_model()


+@router.get(
+    "/props", dependencies=[Depends(check_api_key), Depends(check_model_container)]
+)
+async def model_props() -> ModelPropsResponse:
+    """
+    Returns specific properties of a model for clients.
+
+    To get all properties, use /v1/model instead.
+    """
+
+    current_model_card = get_current_model()
+    resp = ModelPropsResponse(
+        total_slots=current_model_card.parameters.max_batch_size,
+        default_generation_settings=ModelDefaultGenerationSettings(
+            n_ctx=current_model_card.parameters.max_seq_len,
+        ),
+    )
+
+    if current_model_card.parameters.prompt_template_content:
+        resp.chat_template = current_model_card.parameters.prompt_template_content
+
+    return resp
+
+
@router.get("/v1/model/draft/list", dependencies=[Depends(check_api_key)])
 async def list_draft_models(request: Request) -> ModelList:
    """
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -16,10 +16,12 @@ class ModelCardParameters(BaseModel):
    max_seq_len: Optional[int] = None
    rope_scale: Optional[float] = 1.0
    rope_alpha: Optional[float] = 1.0
+    max_batch_size: Optional[int] = 1
    cache_size: Optional[int] = None
    cache_mode: Optional[str] = "FP16"
    chunk_size: Optional[int] = 2048
    prompt_template: Optional[str] = None
+    prompt_template_content: Optional[str] = None
    num_experts_per_token: Optional[int] = None
    use_vision: Optional[bool] = False

@@ -139,3 +141,17 @@ class ModelLoadResponse(BaseModel):
    module: int
    modules: int
    status: str
+
+
+class ModelDefaultGenerationSettings(BaseModel):
+    """Contains default generation settings for model props."""
+
+    n_ctx: int
+
+
+class ModelPropsResponse(BaseModel):
+    """Represents a model props response."""
+
+    total_slots: int = 1
+    chat_template: str = ""
+    default_generation_settings: ModelDefaultGenerationSettings
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,30 +16,30 @@ version = "0.0.1"
 description = "An OAI compatible exllamav2 API that's both lightweight and fast"
 requires-python = ">=3.10"
 dependencies = [
-    "fastapi-slim >= 0.110.0",
+    "fastapi-slim >= 0.115",
    "pydantic >= 2.0.0",
    "ruamel.yaml",
    "rich",
    "uvicorn >= 0.28.1",
    "jinja2 >= 3.0.0",
    "loguru",
-    "sse-starlette",
+    "sse-starlette >= 2.2.0",
    "packaging",
-    "tokenizers",
-    "lm-format-enforcer >= 0.9.6",
+    "tokenizers >= 0.21.0",
+    "formatron >= 0.4.11",
+    "kbnf >= 0.4.1",
    "aiofiles",
    "aiohttp",
    "async_lru",
    "huggingface_hub",
    "psutil",
-    "httptools>=0.5.0",
+    "httptools >= 0.5.0",
    "pillow",

    # Improved asyncio loops
    "uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
    "winloop ; platform_system == 'Windows'",

-    # TEMP: Remove once 2.x is fixed in upstream
    "numpy < 2.0.0",

    # For python 3.12
@@ -53,7 +53,6 @@ dependencies = [
 [project.optional-dependencies]
 extras = [
    # Heavy dependencies that aren't for everyday use
-    "outlines",
    "infinity-emb",
    "sentence-transformers",
 ]
@@ -62,68 +61,46 @@ dev = [
 ]
 cu121 = [
    # Torch (Extra index URLs not support in pyproject.toml)
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

    # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

    # Windows FA2 from https://github.com/bdashore3/flash-attention/releases
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",

    # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-]
-cu118 = [
-    # Torch
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-
-    # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
-
-    # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 ]
 amd = [
    # Torch triton for ROCm
-    "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
-    "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.1.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.1.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.1.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",

    # Torch
-    "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
-    "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "torch @ https://download.pytorch.org/whl/rocm6.2/torch-2.5.1%2Brocm6.2-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "torch @ https://download.pytorch.org/whl/rocm6.2/torch-2.5.1%2Brocm6.2-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "torch @ https://download.pytorch.org/whl/rocm6.2/torch-2.5.1%2Brocm6.2-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",

    # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.2.torch2.5.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.2.torch2.5.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.2.torch2.5.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 ]

 # MARK: Ruff options
--- a/start.py
+++ b/start.py
@@ -47,7 +47,7 @@ def get_install_features(lib_name: str = None):
        # Ask the user for the GPU lib
        gpu_lib_choices = {
            "A": {"pretty": "NVIDIA Cuda 12.x", "internal": "cu121"},
-            "B": {"pretty": "NVIDIA Cuda 11.8", "internal": "cu118"},
+            "B": {"pretty": "NVIDIA Cuda 11.8 (Unsupported)", "internal": "cu118"},
            "C": {"pretty": "AMD", "internal": "amd"},
        }
        user_input = get_user_choice(
--- a/templates/tool_calls/chatml_with_headers.jinja
+++ b/templates/tool_calls/chatml_with_headers.jinja
@@ -6,7 +6,7 @@
 {%- set start_header = "<|start_header_id|>" -%}
 {%- set end_header = "<|end_header_id|>\n" -%}

-{%- set example_tool_call -%}[
+{%- set example_tool_call = '[
    {
        "id": "tool_id_1342",
        "function": {
@@ -23,29 +23,26 @@
        },
        "type": "function"
    }
-]
-{%- endset -%}
+]' -%}

-{%- set inital_system_prompt -%}You are an assistant that has access to the following set of tools, to call a tool:
-1. Prefix calls with '{{ tool_start }}' and end calls with '{{ tool_end }}'
+{%- set inital_system_prompt = 'You are an assistant that has access to the following set of tools, to call a tool:
+1. Prefix calls with ' + tool_start + ' and end calls with ' + tool_end + '
 2. Ensure you use the correct type for arguments. For example, if the argument is a string, ensure it is enclosed in quotes, otherwise, it should not be.
 3. Generate all calls using the following json tool call format. Here is a multi tool call example:

-{{ tool_start }}{{ example_tool_call }}{{ tool_end }}
+' + tool_start + example_tool_call + tool_end + '

 Here are the tools available for you to call:
-{{ tools_json }}
-{%- endset -%}
+' + tools_json -%}

-{%- set tool_reminder -%}Available Tools:
-{{ tools_json }}
+{%- set tool_reminder = 'Available Tools:
+' + tools_json + '

 Tool Call Format Example:
-{{ tool_start }}{{ example_tool_call }}
+' + tool_start + example_tool_call + '

-Prefix & Suffix: Begin tool calls with {{ tool_start }} and end with {{ tool_end }}.
-Argument Types: Use correct data types for arguments (e.g., strings in quotes, numbers without).
-{%- endset -%}
+Prefix & Suffix: Begin tool calls with ' + tool_start + ' and end with ' + tool_end + '.
+Argument Types: Use correct data types for arguments (e.g., strings in quotes, numbers without).' -%}

 {# Template #}

@@ -54,15 +51,15 @@ Argument Types: Use correct data types for arguments (e.g., strings in quotes, n
    {%- if role not in message_roles -%}
        {{ raise_exception('Invalid role ' + message['role'] + '. Only ' + message_roles | join(', ') + ' are supported.') }}
    {%- endif -%}
-    
-    {%- set content = message['content'] | default('', true) | trim -%}
+
+    {%- set content = (message['content'] if message['content'] is defined and message['content'] is not none else '') | trim -%}{#- Parenthesized: filters bind tighter than if/else, so trim must wrap the whole conditional; null content falls back to '' -#}
    {%- if loop.first -%}
 {{ bos_token }}{{ start_header }}{{ role }}{{ end_header }}
 {{ inital_system_prompt }}

 {{ content }}{{ eos_token }}
    {%- endif -%}
-    
+
    {%- if not loop.first -%}
 {{ start_header }}{{ role }}{{ end_header }}
 {{ content }}
@@ -81,4 +78,4 @@ Argument Types: Use correct data types for arguments (e.g., strings in quotes, n
 {{ tool_precursor }}{{ tool_start }}
 {%- else -%}
 {{ start_header }}assistant{{ end_header }}
-{%- endif -%}
+{%- endif -%}