mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-15 00:07:28 +00:00
OAI: Initial vision support in OAI chat completions
* Support image_url inputs containing URLs or base64 strings following the OAI vision spec
* Use async lru cache for image embeddings
* Add a generic wrapper class for multimodal embeddings
This commit is contained in:
36
common/multimodal.py
Normal file
36
common/multimodal.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from typing import List, Optional
|
||||
from backends.exllamav2.vision import get_image_embedding
|
||||
from common import model
|
||||
from pydantic import BaseModel
|
||||
from loguru import logger
|
||||
|
||||
from common.optional_dependencies import dependencies
|
||||
|
||||
if dependencies.exllamav2:
|
||||
from exllamav2 import ExLlamaV2VisionTower
|
||||
|
||||
|
||||
class MultimodalEmbeddingWrapper(BaseModel):
    """Common multimodal embedding wrapper.

    Accumulates vision embeddings for a single request:
    - ``type``: backend discriminator (e.g. "ExLlamaV2MMEmbedding");
      None until the first embedding is added.
    - ``content``: backend-specific embedding objects, in request order.
    - ``text_alias``: placeholder strings standing in for each embedding
      in the prompt text (parallel to ``content``).
    """

    # Fix: was annotated `str` with a `None` default, which is
    # inconsistent and rejected under strict validation.
    type: Optional[str] = None
    # pydantic deep-copies mutable defaults per instance, so the
    # shared-list pitfall of plain classes does not apply here.
    content: List = []
    text_alias: List[str] = []
|
||||
|
||||
|
||||
async def add_image_embedding(
    embeddings: MultimodalEmbeddingWrapper, url: str
) -> MultimodalEmbeddingWrapper:
    """Fetch the image at ``url`` and append its embedding to ``embeddings``.

    Args:
        embeddings: Wrapper accumulating embeddings for this request.
        url: Image location — an HTTP(S) URL or base64 string, per the
            OAI vision spec (resolution handled by get_image_embedding).

    Returns:
        The same wrapper, with the embedding and its text alias appended
        when a supported vision model is loaded; unchanged otherwise.
    """

    # Determine the type of vision embedding to use. Guard on the optional
    # dependency first: ExLlamaV2VisionTower is only imported when
    # dependencies.exllamav2 is True, so referencing it unconditionally
    # would raise NameError instead of reaching the error log below.
    if not embeddings.type:
        if dependencies.exllamav2 and isinstance(
            model.container.vision_model, ExLlamaV2VisionTower
        ):
            embeddings.type = "ExLlamaV2MMEmbedding"

    if embeddings.type == "ExLlamaV2MMEmbedding":
        embedding = await get_image_embedding(url)
        embeddings.content.append(embedding)
        # Parallel alias list lets the prompt builder substitute text
        # placeholders for the raw embeddings later.
        embeddings.text_alias.append(embedding.text_alias)
    else:
        logger.error("No valid vision model to create embedding")

    return embeddings
|
||||
Reference in New Issue
Block a user