mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-04-26 17:28:54 +00:00
Model: Add logprobs support
Returns token offsets, selected tokens, post-sampling token probabilities, and the normalized pre-sampling probability of selecting a token (for efficiency purposes). Currently supported only for text completions; chat completion support will come in a later commit.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
16
main.py
16
main.py
@@ -458,12 +458,13 @@ async def generate_completion(request: Request, data: CompletionRequest):
|
||||
new_generation = MODEL_CONTAINER.generate_gen(
|
||||
data.prompt, **data.to_gen_params()
|
||||
)
|
||||
for part, prompt_tokens, completion_tokens in new_generation:
|
||||
for generation in new_generation:
|
||||
if await request.is_disconnected():
|
||||
break
|
||||
|
||||
response = create_completion_response(
|
||||
part, prompt_tokens, completion_tokens, model_path.name
|
||||
**generation,
|
||||
model_name=model_path.name,
|
||||
)
|
||||
|
||||
yield get_sse_packet(response.model_dump_json())
|
||||
@@ -479,13 +480,10 @@ async def generate_completion(request: Request, data: CompletionRequest):
|
||||
generate_with_semaphore(generator), media_type="text/event-stream"
|
||||
)
|
||||
|
||||
response_text, prompt_tokens, completion_tokens = await call_with_semaphore(
|
||||
generation = await call_with_semaphore(
|
||||
partial(MODEL_CONTAINER.generate, data.prompt, **data.to_gen_params())
|
||||
)
|
||||
|
||||
response = create_completion_response(
|
||||
response_text, prompt_tokens, completion_tokens, model_path.name
|
||||
)
|
||||
response = create_completion_response(**generation)
|
||||
|
||||
return response
|
||||
|
||||
@@ -545,12 +543,12 @@ async def generate_chat_completion(request: Request, data: ChatCompletionRequest
|
||||
new_generation = MODEL_CONTAINER.generate_gen(
|
||||
prompt, **data.to_gen_params()
|
||||
)
|
||||
for part, _, _ in new_generation:
|
||||
for generation in new_generation:
|
||||
if await request.is_disconnected():
|
||||
break
|
||||
|
||||
response = create_chat_completion_stream_chunk(
|
||||
const_id, part, model_path.name
|
||||
const_id, generation.get("chunk"), model_path.name
|
||||
)
|
||||
|
||||
yield get_sse_packet(response.model_dump_json())
|
||||
|
||||
Reference in New Issue
Block a user