Async: Add option to use Uvloop/Winloop

These are faster event loops for asyncio which should improve overall
performance. Gate these under an experimental flag for now to stress
test these loops.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri
2024-07-24 18:56:28 -04:00
parent 71de3060bb
commit 5c082b7e8c
6 changed files with 85 additions and 50 deletions

View File

@@ -193,7 +193,12 @@ def add_developer_args(parser: argparse.ArgumentParser):
 developer_group.add_argument(
     "--cuda-malloc-backend",
     type=str_to_bool,
-    help="Disables API request streaming",
+    help="Runs with the pytorch CUDA malloc backend",
 )
+developer_group.add_argument(
+    "--uvloop",
+    type=str_to_bool,
+    help="Run asyncio using Uvloop or Winloop",
+)

View File

@@ -62,6 +62,11 @@ developer:
 # This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
 #cuda_malloc_backend: False

+# Enable Uvloop or Winloop (default: False)
+# Make the program utilize a faster async event loop which can improve performance
+# NOTE: It's recommended to enable this, but if something breaks, turn this off.
+#uvloop: False
+
 # Options for model overrides and loading
 # Please read the comments to understand how arguments are handled between initial and API loads
 model:

View File

@@ -1,3 +1,4 @@
+import asyncio
 import uvicorn
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
@@ -71,11 +72,15 @@ async def start_api(host: str, port: int):
     # Setup app
     app = setup_app()

+    # Get the current event loop
+    loop = asyncio.get_running_loop()
+
     config = uvicorn.Config(
         app,
         host=host,
         port=port,
         log_config=UVICORN_LOG_CONFIG,
+        loop=loop,
     )
     server = uvicorn.Server(config)

110
main.py
View File

@@ -1,10 +1,10 @@
 """The main tabbyAPI module. Contains the FastAPI server and endpoints."""

 import asyncio
-import aiofiles
 import json
 import os
 import pathlib
+import platform
 import signal

 from loguru import logger
 from typing import Optional
@@ -23,51 +23,8 @@ if not do_export_openapi:
     from backends.exllamav2.utils import check_exllama_version

-async def entrypoint(args: Optional[dict] = None):
-    """Entry function for program startup"""
+async def entrypoint_async():
+    """Async entry function for program startup"""

-    setup_logger()
-
-    # Set up signal aborting
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    if os.getenv("EXPORT_OPENAPI", "").lower() in ("true", "1"):
-        openapi_json = export_openapi()
-
-        async with aiofiles.open("openapi.json", "w") as f:
-            await f.write(json.dumps(openapi_json))
-            logger.info("Successfully wrote OpenAPI spec to openapi.json")
-
-        return
-
-    # Load from YAML config
-    config.from_file(pathlib.Path("config.yml"))
-
-    # Parse and override config from args
-    if args is None:
-        parser = init_argparser()
-        args = convert_args_to_dict(parser.parse_args(), parser)
-
-    config.from_args(args)
-
-    developer_config = config.developer_config()
-
-    # Check exllamav2 version and give a descriptive error if it's too old
-    # Skip if launching unsafely
-    if unwrap(developer_config.get("unsafe_launch"), False):
-        logger.warning(
-            "UNSAFE: Skipping ExllamaV2 version check.\n"
-            "If you aren't a developer, please keep this off!"
-        )
-    else:
-        check_exllama_version()
-
-    # Enable CUDA malloc backend
-    if unwrap(developer_config.get("cuda_malloc_backend"), False):
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
-        logger.warning("Enabled the experimental CUDA malloc backend.")
-
     network_config = config.network_config()
@@ -131,5 +88,64 @@ async def entrypoint(args: Optional[dict] = None):
     await start_api(host, port)
def entrypoint(arguments: Optional[dict] = None):
    """Synchronous program entry point.

    Sets up logging and signal handling, optionally exports the OpenAPI
    spec and exits, merges YAML config with CLI arguments, applies the
    developer flags, and finally drives the async half of startup through
    ``asyncio.run``.

    Args:
        arguments: Pre-parsed argument dict; when None, arguments are
            parsed from the command line.
    """

    setup_logger()

    # Abort cleanly on Ctrl+C or a termination request
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, signal_handler)

    if do_export_openapi:
        # Spec-export mode: write the schema to disk and skip API startup
        spec = export_openapi()
        pathlib.Path("openapi.json").write_text(json.dumps(spec))
        logger.info("Successfully wrote OpenAPI spec to openapi.json")
        return

    # YAML config is loaded first; CLI arguments override it afterwards
    config.from_file(pathlib.Path("config.yml"))
    if arguments is None:
        arg_parser = init_argparser()
        arguments = convert_args_to_dict(arg_parser.parse_args(), arg_parser)
    config.from_args(arguments)

    developer_config = config.developer_config()

    # Version-check exllamav2 unless the user explicitly launched unsafely
    if not unwrap(developer_config.get("unsafe_launch"), False):
        check_exllama_version()
    else:
        logger.warning(
            "UNSAFE: Skipping ExllamaV2 version check.\n"
            "If you aren't a developer, please keep this off!"
        )

    # Opt-in pytorch CUDA malloc backend
    if unwrap(developer_config.get("cuda_malloc_backend"), False):
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
        logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")

    # Opt-in faster event loop: Winloop on Windows, Uvloop elsewhere
    if unwrap(developer_config.get("uvloop"), False):
        if platform.system() == "Windows":
            from winloop import install
        else:
            from uvloop import install

        # Swap the asyncio event loop policy before any loop is created
        install()
        logger.warning("EXPERIMENTAL: Running program with Uvloop/Winloop.")

    # Hand off to the async portion of startup
    asyncio.run(entrypoint_async())
 if __name__ == "__main__":
-    asyncio.run(entrypoint())
+    entrypoint()

View File

@@ -29,6 +29,10 @@ dependencies = [
"lm-format-enforcer >= 0.9.6", "lm-format-enforcer >= 0.9.6",
"aiofiles", "aiofiles",
# Improved asyncio loops
"uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
"winloop ; platform_system == 'Windows'",
# TEMP: Remove once 2.x is fixed in upstream # TEMP: Remove once 2.x is fixed in upstream
"numpy < 2.0.0", "numpy < 2.0.0",

View File

@@ -1,6 +1,5 @@
 """Utility to automatically upgrade and start the API"""

-import asyncio
 import argparse
 import os
 import pathlib
@@ -159,4 +158,5 @@ if __name__ == "__main__":
     # Import entrypoint after installing all requirements
     from main import entrypoint

-    asyncio.run(entrypoint(convert_args_to_dict(args, parser)))
+    converted_args = convert_args_to_dict(args, parser)
+    entrypoint(converted_args)