mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-04-29 10:42:03 +00:00
Async: Add option to use Uvloop/Winloop
These are faster event loops for asyncio which should improve overall performance. Gate them under an experimental flag for now to stress test these loops.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
@@ -193,7 +193,12 @@ def add_developer_args(parser: argparse.ArgumentParser):
|
|||||||
developer_group.add_argument(
|
developer_group.add_argument(
|
||||||
"--cuda-malloc-backend",
|
"--cuda-malloc-backend",
|
||||||
type=str_to_bool,
|
type=str_to_bool,
|
||||||
help="Disables API request streaming",
|
help="Runs with the pytorch CUDA malloc backend",
|
||||||
|
)
|
||||||
|
developer_group.add_argument(
|
||||||
|
"--uvloop",
|
||||||
|
type=str_to_bool,
|
||||||
|
help="Run asyncio using Uvloop or Winloop",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -62,6 +62,11 @@ developer:
|
|||||||
# This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
|
# This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
|
||||||
#cuda_malloc_backend: False
|
#cuda_malloc_backend: False
|
||||||
|
|
||||||
|
# Enable Uvloop or Winloop (default: False)
|
||||||
|
# Make the program utilize a faster async event loop which can improve performance
|
||||||
|
# NOTE: This option is experimental. Enabling it is recommended, but if something breaks, turn it off.
|
||||||
|
#uvloop: False
|
||||||
|
|
||||||
# Options for model overrides and loading
|
# Options for model overrides and loading
|
||||||
# Please read the comments to understand how arguments are handled between initial and API loads
|
# Please read the comments to understand how arguments are handled between initial and API loads
|
||||||
model:
|
model:
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import asyncio
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
@@ -71,11 +72,15 @@ async def start_api(host: str, port: int):
|
|||||||
# Setup app
|
# Setup app
|
||||||
app = setup_app()
|
app = setup_app()
|
||||||
|
|
||||||
|
# Get the current event loop
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
|
||||||
config = uvicorn.Config(
|
config = uvicorn.Config(
|
||||||
app,
|
app,
|
||||||
host=host,
|
host=host,
|
||||||
port=port,
|
port=port,
|
||||||
log_config=UVICORN_LOG_CONFIG,
|
log_config=UVICORN_LOG_CONFIG,
|
||||||
|
loop=loop,
|
||||||
)
|
)
|
||||||
server = uvicorn.Server(config)
|
server = uvicorn.Server(config)
|
||||||
|
|
||||||
|
|||||||
110
main.py
110
main.py
@@ -1,10 +1,10 @@
|
|||||||
"""The main tabbyAPI module. Contains the FastAPI server and endpoints."""
|
"""The main tabbyAPI module. Contains the FastAPI server and endpoints."""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import aiofiles
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import platform
|
||||||
import signal
|
import signal
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -23,51 +23,8 @@ if not do_export_openapi:
|
|||||||
from backends.exllamav2.utils import check_exllama_version
|
from backends.exllamav2.utils import check_exllama_version
|
||||||
|
|
||||||
|
|
||||||
async def entrypoint(args: Optional[dict] = None):
|
async def entrypoint_async():
|
||||||
"""Entry function for program startup"""
|
"""Async entry function for program startup"""
|
||||||
|
|
||||||
setup_logger()
|
|
||||||
|
|
||||||
# Set up signal aborting
|
|
||||||
signal.signal(signal.SIGINT, signal_handler)
|
|
||||||
signal.signal(signal.SIGTERM, signal_handler)
|
|
||||||
|
|
||||||
if os.getenv("EXPORT_OPENAPI", "").lower() in ("true", "1"):
|
|
||||||
openapi_json = export_openapi()
|
|
||||||
|
|
||||||
async with aiofiles.open("openapi.json", "w") as f:
|
|
||||||
await f.write(json.dumps(openapi_json))
|
|
||||||
logger.info("Successfully wrote OpenAPI spec to openapi.json")
|
|
||||||
|
|
||||||
return
|
|
||||||
|
|
||||||
# Load from YAML config
|
|
||||||
config.from_file(pathlib.Path("config.yml"))
|
|
||||||
|
|
||||||
# Parse and override config from args
|
|
||||||
if args is None:
|
|
||||||
parser = init_argparser()
|
|
||||||
args = convert_args_to_dict(parser.parse_args(), parser)
|
|
||||||
|
|
||||||
config.from_args(args)
|
|
||||||
|
|
||||||
developer_config = config.developer_config()
|
|
||||||
|
|
||||||
# Check exllamav2 version and give a descriptive error if it's too old
|
|
||||||
# Skip if launching unsafely
|
|
||||||
|
|
||||||
if unwrap(developer_config.get("unsafe_launch"), False):
|
|
||||||
logger.warning(
|
|
||||||
"UNSAFE: Skipping ExllamaV2 version check.\n"
|
|
||||||
"If you aren't a developer, please keep this off!"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
check_exllama_version()
|
|
||||||
|
|
||||||
# Enable CUDA malloc backend
|
|
||||||
if unwrap(developer_config.get("cuda_malloc_backend"), False):
|
|
||||||
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
|
|
||||||
logger.warning("Enabled the experimental CUDA malloc backend.")
|
|
||||||
|
|
||||||
network_config = config.network_config()
|
network_config = config.network_config()
|
||||||
|
|
||||||
@@ -131,5 +88,64 @@ async def entrypoint(args: Optional[dict] = None):
|
|||||||
await start_api(host, port)
|
await start_api(host, port)
|
||||||
|
|
||||||
|
|
||||||
|
def entrypoint(arguments: Optional[dict] = None):
|
||||||
|
setup_logger()
|
||||||
|
|
||||||
|
# Set up signal aborting
|
||||||
|
signal.signal(signal.SIGINT, signal_handler)
|
||||||
|
signal.signal(signal.SIGTERM, signal_handler)
|
||||||
|
|
||||||
|
if do_export_openapi:
|
||||||
|
openapi_json = export_openapi()
|
||||||
|
|
||||||
|
with open("openapi.json", "w") as f:
|
||||||
|
f.write(json.dumps(openapi_json))
|
||||||
|
logger.info("Successfully wrote OpenAPI spec to openapi.json")
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
# Load from YAML config
|
||||||
|
config.from_file(pathlib.Path("config.yml"))
|
||||||
|
|
||||||
|
# Parse and override config from args
|
||||||
|
if arguments is None:
|
||||||
|
parser = init_argparser()
|
||||||
|
arguments = convert_args_to_dict(parser.parse_args(), parser)
|
||||||
|
|
||||||
|
config.from_args(arguments)
|
||||||
|
developer_config = config.developer_config()
|
||||||
|
|
||||||
|
# Check exllamav2 version and give a descriptive error if it's too old
|
||||||
|
# Skip if launching unsafely
|
||||||
|
|
||||||
|
if unwrap(developer_config.get("unsafe_launch"), False):
|
||||||
|
logger.warning(
|
||||||
|
"UNSAFE: Skipping ExllamaV2 version check.\n"
|
||||||
|
"If you aren't a developer, please keep this off!"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
check_exllama_version()
|
||||||
|
|
||||||
|
# Enable CUDA malloc backend
|
||||||
|
if unwrap(developer_config.get("cuda_malloc_backend"), False):
|
||||||
|
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
|
||||||
|
logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")
|
||||||
|
|
||||||
|
# Use Uvloop/Winloop
|
||||||
|
if unwrap(developer_config.get("uvloop"), False):
|
||||||
|
if platform.system() == "Windows":
|
||||||
|
from winloop import install
|
||||||
|
else:
|
||||||
|
from uvloop import install
|
||||||
|
|
||||||
|
# Set loop event policy
|
||||||
|
install()
|
||||||
|
|
||||||
|
logger.warning("EXPERIMENTAL: Running program with Uvloop/Winloop.")
|
||||||
|
|
||||||
|
# Enter into the async event loop
|
||||||
|
asyncio.run(entrypoint_async())
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(entrypoint())
|
entrypoint()
|
||||||
|
|||||||
@@ -29,6 +29,10 @@ dependencies = [
|
|||||||
"lm-format-enforcer >= 0.9.6",
|
"lm-format-enforcer >= 0.9.6",
|
||||||
"aiofiles",
|
"aiofiles",
|
||||||
|
|
||||||
|
# Improved asyncio loops
|
||||||
|
"uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
|
||||||
|
"winloop ; platform_system == 'Windows'",
|
||||||
|
|
||||||
# TEMP: Remove once 2.x is fixed in upstream
|
# TEMP: Remove once 2.x is fixed in upstream
|
||||||
"numpy < 2.0.0",
|
"numpy < 2.0.0",
|
||||||
|
|
||||||
|
|||||||
4
start.py
4
start.py
@@ -1,6 +1,5 @@
|
|||||||
"""Utility to automatically upgrade and start the API"""
|
"""Utility to automatically upgrade and start the API"""
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
@@ -159,4 +158,5 @@ if __name__ == "__main__":
|
|||||||
# Import entrypoint after installing all requirements
|
# Import entrypoint after installing all requirements
|
||||||
from main import entrypoint
|
from main import entrypoint
|
||||||
|
|
||||||
asyncio.run(entrypoint(convert_args_to_dict(args, parser)))
|
converted_args = convert_args_to_dict(args, parser)
|
||||||
|
entrypoint(converted_args)
|
||||||
|
|||||||
Reference in New Issue
Block a user