mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-14 15:57:27 +00:00
Merge branch 'main' into draft-split
This commit is contained in:
@@ -1,110 +1,16 @@
|
||||
import traceback
|
||||
from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer
|
||||
from exllamav2.generator.filters import ExLlamaV2Filter, ExLlamaV2PrefixFilter
|
||||
from lmformatenforcer import (
|
||||
JsonSchemaParser,
|
||||
RegexParser,
|
||||
TokenEnforcer,
|
||||
CharacterLevelParser,
|
||||
)
|
||||
from lmformatenforcer.integrations.exllamav2 import (
|
||||
build_token_enforcer_tokenizer_data,
|
||||
)
|
||||
from loguru import logger
|
||||
from typing import List
|
||||
import typing
|
||||
from functools import lru_cache
|
||||
from typing import List
|
||||
|
||||
|
||||
class OutlinesTokenizerWrapper:
|
||||
"""Wrapper for Outlines tokenizer"""
|
||||
|
||||
def __init__(self, tokenizer):
|
||||
self.tokenizer = tokenizer
|
||||
id_to_piece = self.tokenizer.get_id_to_piece_list()
|
||||
self.vocabulary = {piece: idx for idx, piece in enumerate(id_to_piece)}
|
||||
self.eos_token_id = self.tokenizer.eos_token_id
|
||||
self.eos_token = id_to_piece[self.tokenizer.eos_token_id]
|
||||
self.special_tokens = list(self.tokenizer.extended_id_to_piece.keys())
|
||||
|
||||
def convert_token_to_string(self, token):
|
||||
return token
|
||||
|
||||
def decode(self, tokens):
|
||||
s = ""
|
||||
id_to_piece = self.tokenizer.get_id_to_piece_list()
|
||||
for t in tokens:
|
||||
s += id_to_piece[t]
|
||||
return s
|
||||
|
||||
|
||||
class ExLlamaV2EbnfFilter(ExLlamaV2Filter):
|
||||
"""Filter class for context-free grammar via outlines"""
|
||||
|
||||
def __init__(self, model, tokenizer, grammar):
|
||||
from outlines.fsm.fsm import CFGFSM
|
||||
|
||||
super().__init__(model, tokenizer)
|
||||
|
||||
self.wrapped_tokenizer = OutlinesTokenizerWrapper(tokenizer)
|
||||
self.fsm = CFGFSM(grammar, self.wrapped_tokenizer)
|
||||
self.state = self.fsm.first_state
|
||||
|
||||
def begin(self, prefix_str=""):
|
||||
self.state = self.fsm.first_state
|
||||
|
||||
def feed(self, token):
|
||||
self.state = self.fsm.next_state(self.state, token.item())
|
||||
|
||||
def next(self):
|
||||
return self.fsm.allowed_token_ids(self.state), set()
|
||||
|
||||
def use_background_worker(self):
|
||||
return True
|
||||
|
||||
|
||||
@lru_cache(10)
|
||||
def _get_lmfe_tokenizer_data(tokenizer: ExLlamaV2Tokenizer):
|
||||
return build_token_enforcer_tokenizer_data(tokenizer)
|
||||
|
||||
|
||||
class ExLlamaV2TokenEnforcerFilter(ExLlamaV2Filter):
|
||||
"""Filter class for LMFE"""
|
||||
|
||||
token_sequence: List[int]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: ExLlamaV2,
|
||||
tokenizer: ExLlamaV2Tokenizer,
|
||||
character_level_parser: CharacterLevelParser,
|
||||
):
|
||||
super().__init__(model, tokenizer)
|
||||
tokenizer_data = _get_lmfe_tokenizer_data(tokenizer)
|
||||
self.token_enforcer = TokenEnforcer(tokenizer_data, character_level_parser)
|
||||
self.token_sequence = []
|
||||
|
||||
def begin(self, prefix_str: str):
|
||||
self.token_sequence = []
|
||||
|
||||
def feed(self, token):
|
||||
self.token_sequence.append(int(token[0][0]))
|
||||
|
||||
def next(self):
|
||||
allowed_tokens = self.token_enforcer.get_allowed_tokens(self.token_sequence)
|
||||
if not hasattr(self, "allow_return_type_list"):
|
||||
return set(allowed_tokens), set()
|
||||
else:
|
||||
return sorted(allowed_tokens), []
|
||||
|
||||
def use_background_worker(self):
|
||||
return True
|
||||
|
||||
|
||||
def clear_grammar_func_cache():
|
||||
"""Flush tokenizer_data cache to avoid holding references to
|
||||
tokenizers after unloading a model"""
|
||||
|
||||
_get_lmfe_tokenizer_data.cache_clear()
|
||||
import torch
|
||||
from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer
|
||||
from exllamav2.generator.filters import ExLlamaV2Filter
|
||||
from formatron.extractor import NonterminalExtractor
|
||||
from formatron.formatter import FormatterBuilder
|
||||
from formatron.integrations.exllamav2 import FormatterFilter, create_engine_vocabulary
|
||||
from formatron.schemas import json_schema
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class ExLlamaV2Grammar:
|
||||
@@ -117,7 +23,7 @@ class ExLlamaV2Grammar:
|
||||
|
||||
def add_json_schema_filter(
|
||||
self,
|
||||
json_schema: dict,
|
||||
schema: dict,
|
||||
model: ExLlamaV2,
|
||||
tokenizer: ExLlamaV2Tokenizer,
|
||||
):
|
||||
@@ -125,7 +31,16 @@ class ExLlamaV2Grammar:
|
||||
|
||||
# Create the parser
|
||||
try:
|
||||
schema_parser = JsonSchemaParser(json_schema)
|
||||
# Add fields required by formatron if not present
|
||||
if "$id" not in schema:
|
||||
schema["$id"] = "https://example.com/example.json"
|
||||
if "$schema" not in schema:
|
||||
schema["$schema"] = "http://json-schema.org/draft-07/schema#"
|
||||
|
||||
# Validate schema and create formatter
|
||||
schema = json_schema.create_schema(schema)
|
||||
f = FormatterBuilder()
|
||||
f.append_line(f"{f.json(schema)}")
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
logger.error(
|
||||
@@ -135,14 +50,10 @@ class ExLlamaV2Grammar:
|
||||
|
||||
return
|
||||
|
||||
# Allow JSON objects or JSON arrays at the top level
|
||||
json_prefixes = ["[", "{"]
|
||||
|
||||
lmfilter = ExLlamaV2TokenEnforcerFilter(model, tokenizer, schema_parser)
|
||||
prefix_filter = ExLlamaV2PrefixFilter(model, tokenizer, json_prefixes)
|
||||
lmfilter = _create_formatter_filter(model, tokenizer, f)
|
||||
|
||||
# Append the filters
|
||||
self.filters.extend([lmfilter, prefix_filter])
|
||||
self.filters.append(lmfilter)
|
||||
|
||||
def add_regex_filter(
|
||||
self,
|
||||
@@ -154,7 +65,9 @@ class ExLlamaV2Grammar:
|
||||
|
||||
# Create the parser
|
||||
try:
|
||||
pattern_parser = RegexParser(pattern)
|
||||
# Validate regex and create formatter
|
||||
f = FormatterBuilder()
|
||||
f.append_line(f"{f.regex(pattern)}")
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
logger.error(
|
||||
@@ -164,32 +77,82 @@ class ExLlamaV2Grammar:
|
||||
|
||||
return
|
||||
|
||||
lmfilter = ExLlamaV2TokenEnforcerFilter(model, tokenizer, pattern_parser)
|
||||
lmfilter = _create_formatter_filter(model, tokenizer, f)
|
||||
|
||||
# Append the filters
|
||||
self.filters.append(lmfilter)
|
||||
|
||||
def add_ebnf_filter(
|
||||
def add_kbnf_filter(
|
||||
self,
|
||||
ebnf_string: str,
|
||||
kbnf_string: str,
|
||||
model: ExLlamaV2,
|
||||
tokenizer: ExLlamaV2Tokenizer,
|
||||
):
|
||||
"""
|
||||
Add an EBNF grammar filter.
|
||||
Possibly replace outlines with an in-house solution in the future.
|
||||
"""
|
||||
"""Adds an ExllamaV2 filter based on KBNF grammar."""
|
||||
|
||||
# Create the parser
|
||||
try:
|
||||
ebnf_filter = ExLlamaV2EbnfFilter(model, tokenizer, ebnf_string)
|
||||
except ImportError:
|
||||
# Validate KBNF and create formatter
|
||||
f = FormatterBuilder()
|
||||
f.append_line(
|
||||
f"""{f.extractor(lambda nonterminal:
|
||||
CFGExtractor(nonterminal, kbnf_string))}"""
|
||||
)
|
||||
except Exception:
|
||||
logger.error(
|
||||
"Skipping EBNF parsing because Outlines is not installed.\n"
|
||||
"Please run the following command in your environment "
|
||||
"to install extra packages:\n"
|
||||
"pip install -U .[extras]"
|
||||
"Skipping because the KBNF string couldn't be parsed. "
|
||||
"Please read the above error for more information."
|
||||
)
|
||||
|
||||
return
|
||||
|
||||
self.filters.append(ebnf_filter)
|
||||
lmfilter = _create_formatter_filter(model, tokenizer, f)
|
||||
|
||||
# Append the filters
|
||||
self.filters.append(lmfilter)
|
||||
|
||||
|
||||
class CFGExtractor(NonterminalExtractor):
|
||||
"""Extractor class for KBNF context-free grammar"""
|
||||
|
||||
def __init__(self, nonterminal: str, kbnf_string: str):
|
||||
super().__init__(nonterminal)
|
||||
self.kbnf_string = kbnf_string
|
||||
|
||||
# Return the entire input string as the extracted string
|
||||
def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]:
|
||||
return "", input_str
|
||||
|
||||
@property
|
||||
def kbnf_definition(self) -> str:
|
||||
return self.kbnf_string.replace("start", self.nonterminal)
|
||||
|
||||
|
||||
@lru_cache(1)
|
||||
def _create_cached_engine_vocabulary(tokenizer: ExLlamaV2Tokenizer):
|
||||
"""Build and cache engine vocabulary on first grammar run"""
|
||||
|
||||
return create_engine_vocabulary(tokenizer)
|
||||
|
||||
|
||||
def _create_formatter_filter(
|
||||
model: ExLlamaV2, tokenizer: ExLlamaV2Tokenizer, formatter_builder: FormatterBuilder
|
||||
) -> ExLlamaV2Filter:
|
||||
"""
|
||||
Create a formatter filter for the ExLlamaV2 engine.
|
||||
Minimalist clone of formatron.integrations.exllamav2.create_formatter_filter
|
||||
with lru_cache enabled for engine vocabulary
|
||||
"""
|
||||
|
||||
vocab = _create_cached_engine_vocabulary(tokenizer)
|
||||
f = formatter_builder.build(
|
||||
vocab, lambda tokens: tokenizer.decode(torch.tensor(tokens))
|
||||
)
|
||||
return FormatterFilter(model, tokenizer, f)
|
||||
|
||||
|
||||
def clear_grammar_func_cache():
|
||||
"""Flush tokenizer_data cache to avoid holding references to
|
||||
tokenizers after unloading a model"""
|
||||
|
||||
_create_cached_engine_vocabulary.cache_clear()
|
||||
|
||||
@@ -498,16 +498,18 @@ class ExllamaV2Container:
|
||||
"rope_scale": self.config.scale_pos_emb,
|
||||
"rope_alpha": self.config.scale_alpha_value,
|
||||
"max_seq_len": self.config.max_seq_len,
|
||||
"max_batch_size": self.max_batch_size,
|
||||
"cache_size": self.cache_size,
|
||||
"cache_mode": self.cache_mode,
|
||||
"chunk_size": self.config.max_input_len,
|
||||
"num_experts_per_token": self.config.num_experts_per_token,
|
||||
"prompt_template": self.prompt_template.name
|
||||
if self.prompt_template
|
||||
else None,
|
||||
"use_vision": self.use_vision,
|
||||
}
|
||||
|
||||
if self.prompt_template:
|
||||
model_params["prompt_template"] = self.prompt_template.name
|
||||
model_params["prompt_template_content"] = self.prompt_template.raw_template
|
||||
|
||||
if self.draft_config:
|
||||
draft_model_params = {
|
||||
"name": self.draft_model_dir.name,
|
||||
@@ -787,6 +789,10 @@ class ExllamaV2Container:
|
||||
max_batch_size=self.max_batch_size,
|
||||
paged=self.paged,
|
||||
)
|
||||
|
||||
# Update the state of the container var
|
||||
if self.max_batch_size is None:
|
||||
self.max_batch_size = self.generator.generator.max_batch_size
|
||||
finally:
|
||||
# This means the generator is being recreated
|
||||
# The load lock is already released in the load function
|
||||
@@ -1222,7 +1228,7 @@ class ExllamaV2Container:
|
||||
# Add EBNF filter if it exists
|
||||
grammar_string = unwrap(kwargs.get("grammar_string"))
|
||||
if grammar_string:
|
||||
grammar_handler.add_ebnf_filter(grammar_string, self.model, self.tokenizer)
|
||||
grammar_handler.add_kbnf_filter(grammar_string, self.model, self.tokenizer)
|
||||
|
||||
# Set banned strings
|
||||
banned_strings: List[str] = unwrap(kwargs.get("banned_strings"), [])
|
||||
@@ -1329,17 +1335,49 @@ class ExllamaV2Container:
|
||||
|
||||
# The first index will always be the positive prompt
|
||||
context_len = input_ids[0].size(dim=-1)
|
||||
if context_len > self.config.max_seq_len:
|
||||
raise ValueError(
|
||||
f"Context length {context_len} is greater than max_seq_len "
|
||||
f"{self.config.max_seq_len}"
|
||||
)
|
||||
|
||||
# The second index will be the negative prompt if CFG is enabled
|
||||
negative_context_len = input_ids[1].size(dim=-1) if negative_prompt else 0
|
||||
|
||||
# Automatically set max_tokens to fill up the context
|
||||
# This should be an OK default, but may be changed in the future
|
||||
max_tokens = unwrap(
|
||||
kwargs.get("max_tokens"), self.config.max_seq_len - context_len
|
||||
kwargs.get("max_tokens"),
|
||||
self.config.max_seq_len - max(context_len, negative_context_len),
|
||||
)
|
||||
if max_tokens < 1:
|
||||
logger.warning("max_tokens must be a positive integer, setting to 1.")
|
||||
max_tokens = 1
|
||||
|
||||
# Determine if the negative context or the context length is bigger
|
||||
context_to_check = max(negative_context_len, context_len)
|
||||
|
||||
# Check highest possible total length of request
|
||||
if context_to_check + max_tokens > self.config.max_seq_len:
|
||||
preamble = (
|
||||
"Negative prompt request"
|
||||
if negative_context_len > context_len
|
||||
else "Request"
|
||||
)
|
||||
|
||||
raise ValueError(
|
||||
f"{preamble} length {context_to_check} + {max_tokens} is greater than "
|
||||
f"max_seq_len {self.config.max_seq_len}"
|
||||
)
|
||||
|
||||
# Check total required pages for CFG request to avoid overallocation
|
||||
if negative_prompt and (
|
||||
sum(
|
||||
256 * math.ceil((context + max_tokens) / 256)
|
||||
for context in (context_len, negative_context_len)
|
||||
)
|
||||
> self.cache_size
|
||||
):
|
||||
raise ValueError(
|
||||
f"Total required page size for request "
|
||||
f"{context_len} + {negative_context_len} + {max_tokens} * 2 "
|
||||
f"is greater than cache_size {self.cache_size}"
|
||||
)
|
||||
|
||||
# Set min_tokens to generate while keeping EOS banned
|
||||
min_tokens = unwrap(kwargs.get("min_tokens"), 0)
|
||||
|
||||
@@ -52,6 +52,10 @@ def _log_formatter(record: dict):
|
||||
"ERROR": "red",
|
||||
"CRITICAL": "bold white on red",
|
||||
}
|
||||
|
||||
time = record.get("time")
|
||||
colored_time = f"[grey37]{time:YYYY-DD-MM HH:mm:ss.SSS}[/grey37]"
|
||||
|
||||
level = record.get("level")
|
||||
level_color = color_map.get(level.name, "cyan")
|
||||
colored_level = f"[{level_color}]{level.name}[/{level_color}]:"
|
||||
@@ -69,9 +73,11 @@ def _log_formatter(record: dict):
|
||||
|
||||
fmt = ""
|
||||
if len(lines) > 1:
|
||||
fmt = "\n".join([f"{colored_level}{separator}{line}" for line in lines])
|
||||
fmt = "\n".join(
|
||||
[f"{colored_time} {colored_level}{separator}{line}" for line in lines]
|
||||
)
|
||||
else:
|
||||
fmt = f"{colored_level}{separator}{message}"
|
||||
fmt = f"{colored_time} {colored_level}{separator}{message}"
|
||||
|
||||
return fmt
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
from typing import List
|
||||
from backends.exllamav2.vision import get_image_embedding
|
||||
from common import model
|
||||
from loguru import logger
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List
|
||||
|
||||
from common.optional_dependencies import dependencies
|
||||
|
||||
@@ -9,12 +10,12 @@ if dependencies.exllamav2:
|
||||
from exllamav2 import ExLlamaV2VisionTower
|
||||
|
||||
|
||||
class MultimodalEmbeddingWrapper:
|
||||
class MultimodalEmbeddingWrapper(BaseModel):
|
||||
"""Common multimodal embedding wrapper"""
|
||||
|
||||
type: str = None
|
||||
content: List = []
|
||||
text_alias: List[str] = []
|
||||
content: list = Field(default_factory=list)
|
||||
text_alias: List[str] = Field(default_factory=list)
|
||||
|
||||
async def add(self, url: str):
|
||||
# Determine the type of vision embedding to use
|
||||
|
||||
@@ -14,14 +14,13 @@ class DependenciesModel(BaseModel):
|
||||
torch: bool
|
||||
exllamav2: bool
|
||||
flash_attn: bool
|
||||
outlines: bool
|
||||
infinity_emb: bool
|
||||
sentence_transformers: bool
|
||||
|
||||
@computed_field
|
||||
@property
|
||||
def extras(self) -> bool:
|
||||
return self.outlines and self.infinity_emb and self.sentence_transformers
|
||||
return self.infinity_emb and self.sentence_transformers
|
||||
|
||||
@computed_field
|
||||
@property
|
||||
|
||||
@@ -25,7 +25,9 @@ class BaseSamplerRequest(BaseModel):
|
||||
|
||||
max_tokens: Optional[int] = Field(
|
||||
default_factory=lambda: get_default_sampler_value("max_tokens"),
|
||||
validation_alias=AliasChoices("max_tokens", "max_length"),
|
||||
validation_alias=AliasChoices(
|
||||
"max_tokens", "max_completion_tokens", "max_length"
|
||||
),
|
||||
description="Aliases: max_length",
|
||||
examples=[150],
|
||||
ge=0,
|
||||
|
||||
@@ -2,7 +2,7 @@ import asyncio
|
||||
from asyncio import CancelledError
|
||||
from fastapi import HTTPException, Request
|
||||
from loguru import logger
|
||||
from sse_starlette import ServerSentEvent
|
||||
from sse_starlette.event import ServerSentEvent
|
||||
|
||||
from common import model
|
||||
from common.networking import (
|
||||
|
||||
@@ -27,7 +27,7 @@ class EmbeddingsRequest(BaseModel):
|
||||
|
||||
class EmbeddingObject(BaseModel):
|
||||
object: str = Field("embedding", description="Type of the object.")
|
||||
embedding: List[float] = Field(
|
||||
embedding: Union[List[float], str] = Field(
|
||||
..., description="Embedding values as a list of floats."
|
||||
)
|
||||
index: int = Field(
|
||||
|
||||
@@ -454,16 +454,23 @@ async def generate_tool_calls(
|
||||
if gen["stop_str"] in tool_data.tool_call_start:
|
||||
if "text" in gen:
|
||||
# non streaming, all generations will have the text they generated
|
||||
pre_tool_prompt = await apply_chat_template(data, gen["text"])
|
||||
pre_tool_prompt, mm_embeddings = await apply_chat_template(
|
||||
data, gen["text"]
|
||||
)
|
||||
elif current_generations is not None:
|
||||
# streaming, we wont have text in the generation,
|
||||
# we'll have to use the current_generations
|
||||
pre_tool_prompt = await apply_chat_template(data, current_generations)
|
||||
pre_tool_prompt, mm_embeddings = await apply_chat_template(
|
||||
data, current_generations
|
||||
)
|
||||
|
||||
gen_tasks.append(
|
||||
asyncio.create_task(
|
||||
model.container.generate(
|
||||
pre_tool_prompt, request.state.id, **gen_params
|
||||
pre_tool_prompt,
|
||||
request.state.id,
|
||||
embeddings=mm_embeddings,
|
||||
**gen_params,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
@@ -4,6 +4,7 @@ from sys import maxsize
|
||||
from typing import Optional
|
||||
from common.multimodal import MultimodalEmbeddingWrapper
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request, Response
|
||||
from fastapi.responses import JSONResponse
|
||||
from sse_starlette import EventSourceResponse
|
||||
|
||||
from common import model, sampling
|
||||
@@ -22,9 +23,11 @@ from endpoints.core.types.lora import LoraList, LoraLoadRequest, LoraLoadRespons
|
||||
from endpoints.core.types.model import (
|
||||
EmbeddingModelLoadRequest,
|
||||
ModelCard,
|
||||
ModelDefaultGenerationSettings,
|
||||
ModelList,
|
||||
ModelLoadRequest,
|
||||
ModelLoadResponse,
|
||||
ModelPropsResponse,
|
||||
)
|
||||
from endpoints.core.types.health import HealthCheckResponse
|
||||
from endpoints.core.types.sampler_overrides import (
|
||||
@@ -65,6 +68,34 @@ async def healthcheck(response: Response) -> HealthCheckResponse:
|
||||
)
|
||||
|
||||
|
||||
@router.get("/.well-known/serviceinfo")
|
||||
async def service_info():
|
||||
return JSONResponse(
|
||||
content={
|
||||
"version": 0.1,
|
||||
"software": {
|
||||
"name": "TabbyAPI",
|
||||
"repository": "https://github.com/theroyallab/tabbyAPI",
|
||||
"homepage": "https://github.com/theroyallab/tabbyAPI",
|
||||
},
|
||||
"api": {
|
||||
"openai": {
|
||||
"name": "OpenAI API",
|
||||
"relative_url": "/v1",
|
||||
"documentation": "https://theroyallab.github.io/tabbyAPI",
|
||||
"version": 1,
|
||||
},
|
||||
"koboldai": {
|
||||
"name": "KoboldAI API",
|
||||
"relative_url": "/api",
|
||||
"documentation": "https://theroyallab.github.io/tabbyAPI",
|
||||
"version": 1,
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
# Model list endpoint
|
||||
@router.get("/v1/models", dependencies=[Depends(check_api_key)])
|
||||
@router.get("/v1/model/list", dependencies=[Depends(check_api_key)])
|
||||
@@ -102,6 +133,30 @@ async def current_model() -> ModelCard:
|
||||
return get_current_model()
|
||||
|
||||
|
||||
@router.get(
|
||||
"/props", dependencies=[Depends(check_api_key), Depends(check_model_container)]
|
||||
)
|
||||
async def model_props() -> ModelPropsResponse:
|
||||
"""
|
||||
Returns specific properties of a model for clients.
|
||||
|
||||
To get all properties, use /v1/model instead.
|
||||
"""
|
||||
|
||||
current_model_card = get_current_model()
|
||||
resp = ModelPropsResponse(
|
||||
total_slots=current_model_card.parameters.max_batch_size,
|
||||
default_generation_settings=ModelDefaultGenerationSettings(
|
||||
n_ctx=current_model_card.parameters.max_seq_len,
|
||||
),
|
||||
)
|
||||
|
||||
if current_model_card.parameters.prompt_template_content:
|
||||
resp.chat_template = current_model_card.parameters.prompt_template_content
|
||||
|
||||
return resp
|
||||
|
||||
|
||||
@router.get("/v1/model/draft/list", dependencies=[Depends(check_api_key)])
|
||||
async def list_draft_models(request: Request) -> ModelList:
|
||||
"""
|
||||
|
||||
@@ -16,10 +16,12 @@ class ModelCardParameters(BaseModel):
|
||||
max_seq_len: Optional[int] = None
|
||||
rope_scale: Optional[float] = 1.0
|
||||
rope_alpha: Optional[float] = 1.0
|
||||
max_batch_size: Optional[int] = 1
|
||||
cache_size: Optional[int] = None
|
||||
cache_mode: Optional[str] = "FP16"
|
||||
chunk_size: Optional[int] = 2048
|
||||
prompt_template: Optional[str] = None
|
||||
prompt_template_content: Optional[str] = None
|
||||
num_experts_per_token: Optional[int] = None
|
||||
use_vision: Optional[bool] = False
|
||||
|
||||
@@ -139,3 +141,17 @@ class ModelLoadResponse(BaseModel):
|
||||
module: int
|
||||
modules: int
|
||||
status: str
|
||||
|
||||
|
||||
class ModelDefaultGenerationSettings(BaseModel):
|
||||
"""Contains default generation settings for model props."""
|
||||
|
||||
n_ctx: int
|
||||
|
||||
|
||||
class ModelPropsResponse(BaseModel):
|
||||
"""Represents a model props response."""
|
||||
|
||||
total_slots: int = 1
|
||||
chat_template: str = ""
|
||||
default_generation_settings: ModelDefaultGenerationSettings
|
||||
|
||||
@@ -16,30 +16,30 @@ version = "0.0.1"
|
||||
description = "An OAI compatible exllamav2 API that's both lightweight and fast"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"fastapi-slim >= 0.110.0",
|
||||
"fastapi-slim >= 0.115",
|
||||
"pydantic >= 2.0.0",
|
||||
"ruamel.yaml",
|
||||
"rich",
|
||||
"uvicorn >= 0.28.1",
|
||||
"jinja2 >= 3.0.0",
|
||||
"loguru",
|
||||
"sse-starlette",
|
||||
"sse-starlette >= 2.2.0",
|
||||
"packaging",
|
||||
"tokenizers",
|
||||
"lm-format-enforcer >= 0.9.6",
|
||||
"tokenizers >= 0.21.0",
|
||||
"formatron >= 0.4.11",
|
||||
"kbnf >= 0.4.1",
|
||||
"aiofiles",
|
||||
"aiohttp",
|
||||
"async_lru",
|
||||
"huggingface_hub",
|
||||
"psutil",
|
||||
"httptools>=0.5.0",
|
||||
"httptools >= 0.5.0",
|
||||
"pillow",
|
||||
|
||||
# Improved asyncio loops
|
||||
"uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
|
||||
"winloop ; platform_system == 'Windows'",
|
||||
|
||||
# TEMP: Remove once 2.x is fixed in upstream
|
||||
"numpy < 2.0.0",
|
||||
|
||||
# For python 3.12
|
||||
@@ -53,7 +53,6 @@ dependencies = [
|
||||
[project.optional-dependencies]
|
||||
extras = [
|
||||
# Heavy dependencies that aren't for everyday use
|
||||
"outlines",
|
||||
"infinity-emb",
|
||||
"sentence-transformers",
|
||||
]
|
||||
@@ -62,68 +61,46 @@ dev = [
|
||||
]
|
||||
cu121 = [
|
||||
# Torch (Extra index URLs not support in pyproject.toml)
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||
|
||||
# Exl2
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.5.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||
|
||||
# Windows FA2 from https://github.com/bdashore3/flash-attention/releases
|
||||
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||
|
||||
# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
|
||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||
]
|
||||
cu118 = [
|
||||
# Torch
|
||||
"torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||
"torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||
"torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||
"torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||
"torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||
"torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||
|
||||
# Exl2
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||
|
||||
# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
|
||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||
]
|
||||
amd = [
|
||||
# Torch triton for ROCm
|
||||
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
|
||||
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
|
||||
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.0.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
||||
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.1.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
|
||||
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.1.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
|
||||
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.1.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
||||
|
||||
# Torch
|
||||
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
|
||||
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
|
||||
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
||||
"torch @ https://download.pytorch.org/whl/rocm6.2/torch-2.5.1%2Brocm6.2-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
|
||||
"torch @ https://download.pytorch.org/whl/rocm6.2/torch-2.5.1%2Brocm6.2-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
|
||||
"torch @ https://download.pytorch.org/whl/rocm6.2/torch-2.5.1%2Brocm6.2-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
||||
|
||||
# Exl2
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.2.torch2.5.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.2.torch2.5.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
|
||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.2.torch2.5.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
||||
]
|
||||
|
||||
# MARK: Ruff options
|
||||
|
||||
2
start.py
2
start.py
@@ -47,7 +47,7 @@ def get_install_features(lib_name: str = None):
|
||||
# Ask the user for the GPU lib
|
||||
gpu_lib_choices = {
|
||||
"A": {"pretty": "NVIDIA Cuda 12.x", "internal": "cu121"},
|
||||
"B": {"pretty": "NVIDIA Cuda 11.8", "internal": "cu118"},
|
||||
"B": {"pretty": "NVIDIA Cuda 11.8 (Unsupported)", "internal": "cu118"},
|
||||
"C": {"pretty": "AMD", "internal": "amd"},
|
||||
}
|
||||
user_input = get_user_choice(
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
{%- set start_header = "<|start_header_id|>" -%}
|
||||
{%- set end_header = "<|end_header_id|>\n" -%}
|
||||
|
||||
{%- set example_tool_call -%}[
|
||||
{%- set example_tool_call = '[
|
||||
{
|
||||
"id": "tool_id_1342",
|
||||
"function": {
|
||||
@@ -23,29 +23,26 @@
|
||||
},
|
||||
"type": "function"
|
||||
}
|
||||
]
|
||||
{%- endset -%}
|
||||
]' -%}
|
||||
|
||||
{%- set inital_system_prompt -%}You are an assistant that has access to the following set of tools, to call a tool:
|
||||
1. Prefix calls with '{{ tool_start }}' and end calls with '{{ tool_end }}'
|
||||
{%- set inital_system_prompt = 'You are an assistant that has access to the following set of tools, to call a tool:
|
||||
1. Prefix calls with ' + tool_start + ' and end calls with ' + tool_end + '
|
||||
2. Ensure you use the correct type for arguments. For example, if the argument is a string, ensure it is enclosed in quotes, otherwise, it should not be.
|
||||
3. Generate all calls using the following json tool call format. Here is a multi tool call example:
|
||||
|
||||
{{ tool_start }}{{ example_tool_call }}{{ tool_end }}
|
||||
' + tool_start + example_tool_call + tool_end + '
|
||||
|
||||
Here are the tools available for you to call:
|
||||
{{ tools_json }}
|
||||
{%- endset -%}
|
||||
' + tools_json -%}
|
||||
|
||||
{%- set tool_reminder -%}Available Tools:
|
||||
{{ tools_json }}
|
||||
{%- set tool_reminder = 'Available Tools:
|
||||
' + tools_json + '
|
||||
|
||||
Tool Call Format Example:
|
||||
{{ tool_start }}{{ example_tool_call }}
|
||||
' + tool_start + example_tool_call + '
|
||||
|
||||
Prefix & Suffix: Begin tool calls with {{ tool_start }} and end with {{ tool_end }}.
|
||||
Argument Types: Use correct data types for arguments (e.g., strings in quotes, numbers without).
|
||||
{%- endset -%}
|
||||
Prefix & Suffix: Begin tool calls with ' + tool_start + ' and end with ' + tool_end + '.
|
||||
Argument Types: Use correct data types for arguments (e.g., strings in quotes, numbers without).' -%}
|
||||
|
||||
{# Template #}
|
||||
|
||||
@@ -54,15 +51,15 @@ Argument Types: Use correct data types for arguments (e.g., strings in quotes, n
|
||||
{%- if role not in message_roles -%}
|
||||
{{ raise_exception('Invalid role ' + message['role'] + '. Only ' + message_roles | join(', ') + ' are supported.') }}
|
||||
{%- endif -%}
|
||||
|
||||
{%- set content = message['content'] | default('', true) | trim -%}
|
||||
|
||||
{%- set content = message['content'] if message['content'] is defined else '' | trim -%}
|
||||
{%- if loop.first -%}
|
||||
{{ bos_token }}{{ start_header }}{{ role }}{{ end_header }}
|
||||
{{ inital_system_prompt }}
|
||||
|
||||
{{ content }}{{ eos_token }}
|
||||
{%- endif -%}
|
||||
|
||||
|
||||
{%- if not loop.first -%}
|
||||
{{ start_header }}{{ role }}{{ end_header }}
|
||||
{{ content }}
|
||||
@@ -81,4 +78,4 @@ Argument Types: Use correct data types for arguments (e.g., strings in quotes, n
|
||||
{{ tool_precursor }}{{ tool_start }}
|
||||
{%- else -%}
|
||||
{{ start_header }}assistant{{ end_header }}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
|
||||
Reference in New Issue
Block a user