mirror of
https://github.com/kvcache-ai/sglang.git
synced 2026-04-20 22:39:01 +00:00
refactor: replace local proto compilation with smg-grpc-proto package (#18682)
This commit is contained in:
16
.github/workflows/lint.yml
vendored
16
.github/workflows/lint.yml
vendored
@@ -32,19 +32,3 @@ jobs:
|
||||
extensions: h,c,cpp,hpp,cu,cuh,cc
|
||||
clangFormatVersion: 18
|
||||
style: file
|
||||
|
||||
- name: Check proto files are in sync
|
||||
run: |
|
||||
if ! diff -q python/sglang/srt/grpc/sglang_scheduler.proto sgl-model-gateway/src/proto/sglang_scheduler.proto; then
|
||||
echo "❌ ERROR: Proto files are out of sync!"
|
||||
echo ""
|
||||
echo "The following files must be kept identical:"
|
||||
echo " - python/sglang/srt/grpc/sglang_scheduler.proto"
|
||||
echo " - sgl-model-gateway/src/proto/sglang_scheduler.proto"
|
||||
echo ""
|
||||
echo "Please ensure both files have the same content."
|
||||
echo ""
|
||||
echo "Differences:"
|
||||
diff python/sglang/srt/grpc/sglang_scheduler.proto sgl-model-gateway/src/proto/sglang_scheduler.proto || true
|
||||
exit 1
|
||||
fi
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -262,10 +262,6 @@ outputs/
|
||||
# setuptools-scm generated version file
|
||||
python/sglang/_version.py
|
||||
|
||||
# Generated protobuf files (regenerate during wheel build or with compile_proto.py)
|
||||
python/sglang/srt/grpc/*_pb2.py
|
||||
python/sglang/srt/grpc/*_pb2_grpc.py
|
||||
python/sglang/srt/grpc/*_pb2.pyi
|
||||
|
||||
# MUSA section
|
||||
# Generated source files by torchada
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
@@ -76,10 +76,10 @@ dependencies = [
|
||||
"uvloop",
|
||||
"xgrammar==0.1.27",
|
||||
|
||||
"grpcio==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-tools==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
|
||||
"grpcio-health-checking==1.75.1", # required for Kubernetes gRPC health probes
|
||||
"smg-grpc-proto>=0.3.3",
|
||||
"grpcio>=1.78.0",
|
||||
"grpcio-reflection>=1.78.0",
|
||||
"grpcio-health-checking>=1.78.0",
|
||||
]
|
||||
|
||||
[[tool.uv.index]]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# https://docs.sglang.io/platforms/cpu_server.html
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
@@ -68,9 +68,9 @@ dependencies = [
|
||||
"uvicorn",
|
||||
"uvloop",
|
||||
"xgrammar==0.1.27",
|
||||
"grpcio==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-tools==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
|
||||
"smg-grpc-proto>=0.3.3",
|
||||
"grpcio>=1.78.0",
|
||||
"grpcio-reflection>=1.78.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
@@ -61,9 +61,9 @@ dependencies = [
|
||||
"uvicorn",
|
||||
"uvloop",
|
||||
"xgrammar==0.1.27",
|
||||
"grpcio==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-tools==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
|
||||
"smg-grpc-proto>=0.3.3",
|
||||
"grpcio>=1.78.0",
|
||||
"grpcio-reflection>=1.78.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
@@ -63,9 +63,9 @@ runtime_common = [
|
||||
"uvicorn",
|
||||
"uvloop",
|
||||
"xgrammar==0.1.27",
|
||||
"grpcio==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-tools==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
|
||||
"smg-grpc-proto>=0.3.3",
|
||||
"grpcio>=1.78.0",
|
||||
"grpcio-reflection>=1.78.0",
|
||||
]
|
||||
|
||||
tracing = [
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
|
||||
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
@@ -67,9 +67,9 @@ dependencies = [
|
||||
"uvicorn",
|
||||
"uvloop",
|
||||
# "xgrammar==0.1.24",  # disabled: xgrammar depends on CUDA PyTorch and Triton only
|
||||
"grpcio==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-tools==1.75.1", # keep it align with compile_proto.py
|
||||
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
|
||||
"smg-grpc-proto>=0.3.3",
|
||||
"grpcio>=1.78.0",
|
||||
"grpcio-reflection>=1.78.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
125
python/setup.py
125
python/setup.py
@@ -1,125 +0,0 @@
|
||||
"""
|
||||
Custom setup.py for SGLang that compiles protobuf files during build.
|
||||
|
||||
This file works alongside pyproject.toml. It hooks into the build process
|
||||
to automatically generate gRPC/protobuf Python files from .proto sources
|
||||
when building the wheel or doing editable installs.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from setuptools import setup
|
||||
from setuptools.command.build_py import build_py
|
||||
from setuptools.command.develop import develop
|
||||
from setuptools.command.egg_info import egg_info
|
||||
from setuptools.errors import SetupError
|
||||
|
||||
# Location of the scheduler .proto file, relative to the directory containing this setup.py.
PROTO_SOURCE = "sglang/srt/grpc/sglang_scheduler.proto"
|
||||
|
||||
|
||||
def compile_proto():
    """Compile the scheduler ``.proto`` file into Python modules.

    Runs ``grpc_tools.protoc`` in-process on ``PROTO_SOURCE`` (resolved
    relative to this file) and writes the ``*_pb2.py``, ``*_pb2_grpc.py`` and
    ``*_pb2.pyi`` outputs next to the ``.proto`` file. Afterwards the generated
    gRPC module is patched to import its ``_pb2`` sibling relatively.

    If the proto file does not exist, a warning is printed and nothing is
    generated.

    Raises:
        SetupError: If ``grpc_tools`` cannot be imported, or if ``protoc``
            returns a non-zero exit code.
    """
    proto_path = Path(__file__).parent / PROTO_SOURCE

    if not proto_path.exists():
        print(f"Warning: Proto file not found at {proto_path}, skipping generation")
        return

    print(f"Generating gRPC files from {PROTO_SOURCE}")

    # Generated modules are written alongside the .proto source, so the
    # include directory and the output directory are the same place.
    proto_dir = proto_path.parent
    output_dir = proto_dir

    # Import grpc_tools.protoc directly instead of running as subprocess.
    # This ensures we use the grpcio-tools installed in the build environment,
    # since sys.executable may point to the main Python interpreter in
    # pip's isolated build environments.
    try:
        import grpc_tools
        from grpc_tools import protoc
    except ImportError as e:
        # Chain the original error so the traceback shows why the import failed.
        raise SetupError(
            f"Failed to import grpc_tools: {e}. "
            "Ensure grpcio-tools is listed in build-system.requires in pyproject.toml"
        ) from e

    # Get the path to well-known proto files bundled with grpcio-tools
    # (e.g., google/protobuf/timestamp.proto, google/protobuf/struct.proto)
    grpc_tools_proto_path = Path(grpc_tools.__file__).parent / "_proto"

    # Build the protoc arguments (protoc.main expects argv-style list)
    args = [
        "protoc",  # argv[0] is the program name
        f"-I{proto_dir}",
        f"-I{grpc_tools_proto_path}",  # Include path for well-known protos
        f"--python_out={output_dir}",
        f"--grpc_python_out={output_dir}",
        f"--pyi_out={output_dir}",
        str(proto_path),
    ]

    print(f"Running protoc with args: {args[1:]}")

    # Save and restore cwd since protoc may change it
    original_cwd = os.getcwd()
    try:
        result = protoc.main(args)
        if result != 0:
            raise SetupError(f"protoc failed with exit code {result}")
    finally:
        os.chdir(original_cwd)

    # Fix imports in generated grpc file (change absolute to relative imports)
    _fix_imports(output_dir, proto_path.stem)

    print(f"Successfully generated gRPC files in {output_dir}")
|
||||
|
||||
|
||||
def _fix_imports(output_dir: Path, proto_stem: str):
    """Make the generated gRPC module import its ``_pb2`` sibling relatively.

    Rewrites ``import <proto_stem>_pb2`` statements in
    ``<output_dir>/<proto_stem>_pb2_grpc.py`` to ``from . import <proto_stem>_pb2``.
    The file is only rewritten (and a message printed) when at least one
    statement was changed; a missing file is silently ignored.

    Args:
        output_dir: Directory containing the generated modules.
        proto_stem: Base name of the ``.proto`` file without its extension.
    """
    # Local import keeps this block self-contained; setup.py does not import re.
    import re

    grpc_file = output_dir / f"{proto_stem}_pb2_grpc.py"
    if not grpc_file.exists():
        return

    content = grpc_file.read_text()

    # Match only whole statements at the start of a line. A plain substring
    # replace would also hit the tail of an already-fixed
    # "from . import X_pb2" line (producing "from . from . import ...") and
    # the prefix of longer module names such as "X_pb2_grpc".
    absolute_import = re.compile(
        rf"^import {re.escape(proto_stem)}_pb2\b", re.MULTILINE
    )
    relative_import = f"from . import {proto_stem}_pb2"

    # A callable replacement avoids backslash/group interpretation of the text.
    updated, replaced = absolute_import.subn(lambda _match: relative_import, content)

    if replaced:
        grpc_file.write_text(updated)
        print("Fixed imports in generated gRPC file")
|
||||
|
||||
|
||||
class BuildPyWithProto(build_py):
    """``build_py`` command that regenerates the gRPC modules before building."""

    def run(self):
        # Generate the protobuf/gRPC modules first, then run the stock command.
        compile_proto()
        super().run()
|
||||
|
||||
|
||||
class DevelopWithProto(develop):
    """``develop`` (editable install) command that regenerates the gRPC modules first."""

    def run(self):
        # Generate the protobuf/gRPC modules first, then run the stock command.
        compile_proto()
        super().run()
|
||||
|
||||
|
||||
class EggInfoWithProto(egg_info):
    """``egg_info`` command that regenerates the gRPC modules first."""

    def run(self):
        # Generate the protobuf/gRPC modules first, then run the stock command.
        compile_proto()
        super().run()
|
||||
|
||||
|
||||
# Register the proto-aware command subclasses so that build, editable-install
# and egg-info runs each call compile_proto() before the stock behaviour.
setup(
    cmdclass={
        "build_py": BuildPyWithProto,
        "develop": DevelopWithProto,
        "egg_info": EggInfoWithProto,
    },
)
|
||||
@@ -21,11 +21,11 @@ from google.protobuf.struct_pb2 import Struct
|
||||
from google.protobuf.timestamp_pb2 import Timestamp
|
||||
from grpc_health.v1 import health_pb2_grpc
|
||||
from grpc_reflection.v1alpha import reflection
|
||||
from smg_grpc_proto import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
|
||||
|
||||
import sglang
|
||||
from sglang.srt.configs.model_config import ModelConfig
|
||||
from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
|
||||
from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
|
||||
from sglang.srt.grpc.grpc_request_manager import GrpcRequestManager
|
||||
from sglang.srt.grpc.health_servicer import SGLangHealthServicer
|
||||
from sglang.srt.grpc.scheduler_launcher import launch_scheduler_process_only
|
||||
|
||||
@@ -1,248 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compile protobuf files for SGLang gRPC server.
|
||||
|
||||
This script compiles .proto files to Python code using grpc_tools.protoc.
|
||||
It generates:
|
||||
- *_pb2.py (protobuf message classes)
|
||||
- *_pb2_grpc.py (gRPC service classes)
|
||||
- *_pb2.pyi (type hints for mypy/IDEs)
|
||||
|
||||
Usage:
|
||||
python compile_proto.py [--check] [--proto-file PROTO_FILE]
|
||||
|
||||
Options:
|
||||
--check Check if regeneration is needed (exit 1 if needed)
|
||||
--proto-file Specify proto file (default: sglang_scheduler.proto)
|
||||
|
||||
### Install Dependencies
|
||||
pip install "grpcio==1.75.1" "grpcio-tools==1.75.1"
|
||||
|
||||
Please make sure to use the same version of grpcio and grpcio-tools specified in pyproject.toml
|
||||
otherwise update the versions specified in pyproject.toml
|
||||
|
||||
### Run Script
|
||||
cd python/sglang/srt/grpc
|
||||
python compile_proto.py
|
||||
"""
|
||||
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
from importlib.metadata import version
|
||||
from pathlib import Path
|
||||
|
||||
# Exact grpcio / grpcio-tools version that compile_proto() requires to be installed.
GRPC_VERSION = "1.75.1"
|
||||
|
||||
|
||||
def get_file_mtime(path: Path) -> float:
    """Return the modification time of ``path``, or ``0.0`` if it does not exist."""
    try:
        stat_result = path.stat()
    except FileNotFoundError:
        # A missing file is reported as "infinitely old".
        return 0.0
    return stat_result.st_mtime
|
||||
|
||||
|
||||
def check_regeneration_needed(proto_file: Path, output_dir: Path) -> bool:
    """Report whether any generated module is older than the proto source.

    Compares the modification time of ``proto_file`` with that of the three
    expected outputs in ``output_dir`` (``_pb2.py``, ``_pb2_grpc.py``,
    ``_pb2.pyi``). A missing output has mtime ``0.0`` and therefore counts as
    stale whenever the proto file exists.

    Returns:
        ``True`` if at least one output is strictly older than the proto file.
    """
    source_mtime = get_file_mtime(proto_file)
    stem = proto_file.stem
    output_suffixes = ("_pb2.py", "_pb2_grpc.py", "_pb2.pyi")

    # any() stops at the first stale output, like the explicit loop it replaces.
    return any(
        get_file_mtime(output_dir / f"{stem}{suffix}") < source_mtime
        for suffix in output_suffixes
    )
|
||||
|
||||
|
||||
def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> bool:
    """Compile ``proto_file`` to Python modules in ``output_dir``.

    Invokes ``python -m grpc_tools.protoc`` as a subprocess (with the proto
    file's directory as working directory), checks that the three expected
    outputs were produced, and then patches the generated gRPC module's import.

    Args:
        proto_file: Path to the ``.proto`` source.
        output_dir: Directory that receives the generated files.
        verbose: When true, print progress messages.

    Returns:
        ``True`` on success; ``False`` if the proto file is missing,
        ``grpcio-tools`` is not importable, protoc fails, or an expected
        output file is absent.

    Raises:
        RuntimeError: If the installed ``grpcio-tools`` or ``grpcio`` version
            differs from ``GRPC_VERSION``.
    """
    if not proto_file.exists():
        print(f"Error: Proto file not found: {proto_file}")
        return False

    if verbose:
        print(f"Found proto file: {proto_file}")

    # Probe for grpc_tools up front so a missing install gives a clear hint.
    try:
        import grpc_tools.protoc  # noqa: F401
    except ImportError:
        print("Error: grpcio-tools not installed")
        print(
            f'Install with: pip install "grpcio-tools=={GRPC_VERSION}" "grpcio=={GRPC_VERSION}"'
        )
        return False

    grpc_tools_version = version("grpcio-tools")
    grpc_version = version("grpcio")
    versions_match = (
        grpc_tools_version == GRPC_VERSION and grpc_version == GRPC_VERSION
    )
    if not versions_match:
        raise RuntimeError(
            f"Error: grpcio-tools version {grpc_tools_version} and grpcio version {grpc_version} detected, but {GRPC_VERSION} is required."
        )

    source_dir = proto_file.parent
    stem = proto_file.stem

    # The proto is passed by bare name because protoc runs with cwd=source_dir.
    cmd = [
        sys.executable,
        "-m",
        "grpc_tools.protoc",
        f"-I{source_dir}",
        f"--python_out={output_dir}",
        f"--grpc_python_out={output_dir}",
        f"--pyi_out={output_dir}",  # Generate type stubs
        str(proto_file.name),
    ]

    if verbose:
        print(f"Running: {' '.join(cmd)}")

    result = subprocess.run(cmd, capture_output=True, text=True, cwd=source_dir)

    if result.returncode != 0:
        print("Error compiling proto:")
        print(result.stderr)
        if result.stdout:
            print(result.stdout)
        return False

    # protoc can exit 0 without writing everything we expect, so verify.
    generated_files = [
        f"{stem}{suffix}" for suffix in ("_pb2.py", "_pb2_grpc.py", "_pb2.pyi")
    ]
    missing_files = [
        name for name in generated_files if not (output_dir / name).exists()
    ]

    if missing_files:
        print(f"Error: Expected generated files not found: {missing_files}")
        return False

    if verbose:
        print("Successfully compiled protobuf files:")
        for name in generated_files:
            print(f"  - {output_dir}/{name}")

    # Fix imports in generated files
    fix_imports(output_dir, stem, verbose)

    return True
|
||||
|
||||
|
||||
def fix_imports(output_dir: Path, proto_stem: str, verbose: bool = True) -> None:
    """Make the generated gRPC module import its ``_pb2`` sibling relatively.

    Rewrites ``import <proto_stem>_pb2`` statements in
    ``<output_dir>/<proto_stem>_pb2_grpc.py`` to ``from . import <proto_stem>_pb2``.
    The file is only rewritten when at least one statement was changed; a
    missing file is silently ignored.

    Args:
        output_dir: Directory containing the generated modules.
        proto_stem: Base name of the ``.proto`` file without its extension.
        verbose: When true, print a message after the file has been rewritten.
    """
    # Local import keeps this block self-contained; the script does not import re.
    import re

    grpc_file = output_dir / f"{proto_stem}_pb2_grpc.py"
    if not grpc_file.exists():
        return

    content = grpc_file.read_text()

    # Match only whole statements at the start of a line. A plain substring
    # replace would also hit the tail of an already-fixed
    # "from . import X_pb2" line (producing "from . from . import ...") and
    # the prefix of longer module names such as "X_pb2_grpc".
    absolute_import = re.compile(
        rf"^import {re.escape(proto_stem)}_pb2\b", re.MULTILINE
    )
    relative_import = f"from . import {proto_stem}_pb2"

    # A callable replacement avoids backslash/group interpretation of the text.
    updated, replaced = absolute_import.subn(lambda _match: relative_import, content)

    if replaced:
        grpc_file.write_text(updated)
        if verbose:
            print("Fixed imports in generated files")
|
||||
|
||||
|
||||
def add_generation_header(output_dir: Path, proto_stem: str) -> None:
    """Prepend an "auto-generated" banner to the generated ``.py`` modules.

    Applies to ``<proto_stem>_pb2.py`` and ``<proto_stem>_pb2_grpc.py`` in
    ``output_dir``. Files that are missing, or that already start with the
    banner's first line, are left untouched.
    """
    banner = (
        "# This file is auto-generated. Do not edit manually.\n"
        "# Regenerate with: python compile_proto.py\n"
        "\n"
    )

    for suffix in ("_pb2.py", "_pb2_grpc.py"):
        target = output_dir / f"{proto_stem}{suffix}"
        if not target.exists():
            continue

        existing = target.read_text()
        # Skip files that already carry the banner so reruns do not stack it.
        if existing.startswith("# This file is auto-generated"):
            continue

        target.write_text(banner + existing)
|
||||
|
||||
|
||||
def main():
    """Parse command-line options, then check or compile the proto file.

    With ``--check`` the process exits with status 1 when regeneration is
    needed and 0 otherwise. Without it the proto file is compiled; on failure
    the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Compile protobuf files for SGLang gRPC server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    cli.add_argument(
        "--check",
        action="store_true",
        help="Check if regeneration is needed (exit 1 if needed)",
    )
    cli.add_argument(
        "--proto-file",
        type=str,
        default="sglang_scheduler.proto",
        help="Proto file to compile (default: sglang_scheduler.proto)",
    )
    cli.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=True,
        help="Verbose output (default: True)",
    )
    cli.add_argument(
        "-q", "--quiet", action="store_true", help="Quiet mode (overrides verbose)"
    )

    options = cli.parse_args()

    # --quiet wins over --verbose.
    be_verbose = options.verbose and not options.quiet

    # The proto file is looked up next to this script; outputs go there too.
    here = Path(__file__).parent
    proto_file = here / options.proto_file
    output_dir = here

    if options.check:
        stale = check_regeneration_needed(proto_file, output_dir)
        if be_verbose:
            print(
                "Proto files need regeneration"
                if stale
                else "Generated files are up to date"
            )
        sys.exit(1 if stale else 0)

    if not compile_proto(proto_file, output_dir, be_verbose):
        if be_verbose:
            print("\n❌ Protobuf compilation failed!")
        sys.exit(1)

    # Banner is only added after a successful compile.
    add_generation_header(output_dir, proto_file.stem)

    if be_verbose:
        print("\n✅ Protobuf compilation successful!")
        print("Generated files are ready for use")
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -1,566 +0,0 @@
|
||||
syntax = "proto3";
|
||||
|
||||
package sglang.grpc.scheduler;
|
||||
|
||||
import "google/protobuf/timestamp.proto";
|
||||
import "google/protobuf/struct.proto";
|
||||
|
||||
// Service definition for SGLang scheduler communication
|
||||
// This protocol bridges the Rust router and Python scheduler
|
||||
service SglangScheduler {
|
||||
// Submit a generation request (supports streaming)
|
||||
rpc Generate(GenerateRequest) returns (stream GenerateResponse);
|
||||
|
||||
// Submit an embedding request
|
||||
rpc Embed(EmbedRequest) returns (EmbedResponse);
|
||||
|
||||
// Health check and metrics
|
||||
rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);
|
||||
|
||||
// Abort a running request
|
||||
rpc Abort(AbortRequest) returns (AbortResponse);
|
||||
|
||||
// Get model information
|
||||
rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);
|
||||
|
||||
// Get server information
|
||||
rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
|
||||
|
||||
// Get comprehensive load metrics
|
||||
rpc GetLoads(GetLoadsRequest) returns (GetLoadsResponse);
|
||||
|
||||
}
|
||||
|
||||
// =====================
|
||||
// Common Types
|
||||
// =====================
|
||||
|
||||
// Sampling parameters matching SGLang's SamplingParams
|
||||
//
|
||||
// IMPORTANT: Do not use SamplingParams::default() directly!
|
||||
// The proto3 defaults (0 for numeric fields) do NOT match the semantic defaults
|
||||
// (temperature=1.0, top_p=1.0, top_k=-1, etc.). Always construct with explicit values
|
||||
// or use the conversion functions in sglang_scheduler.rs / grpc_server.py.
|
||||
message SamplingParams {
|
||||
float temperature = 1;
|
||||
float top_p = 2;
|
||||
int32 top_k = 3;
|
||||
float min_p = 4;
|
||||
float frequency_penalty = 5;
|
||||
float presence_penalty = 6;
|
||||
float repetition_penalty = 7;
|
||||
|
||||
optional int32 max_new_tokens = 8;
|
||||
repeated string stop = 9;
|
||||
repeated uint32 stop_token_ids = 10;
|
||||
bool skip_special_tokens = 11;
|
||||
bool spaces_between_special_tokens = 12;
|
||||
|
||||
// Structured generation
|
||||
oneof constraint {
|
||||
string regex = 13;
|
||||
string json_schema = 14;
|
||||
string ebnf_grammar = 15;
|
||||
string structural_tag = 16;
|
||||
}
|
||||
|
||||
// Parallel sampling
|
||||
int32 n = 17; // Number of samples
|
||||
|
||||
// Additional parameters
|
||||
int32 min_new_tokens = 18;
|
||||
bool ignore_eos = 19;
|
||||
bool no_stop_trim = 20;
|
||||
optional int32 stream_interval = 21;
|
||||
map<string, float> logit_bias = 22;
|
||||
|
||||
// Custom parameters for extensibility
|
||||
google.protobuf.Struct custom_params = 23;
|
||||
}
|
||||
|
||||
|
||||
// Disaggregated serving parameters
|
||||
message DisaggregatedParams {
|
||||
string bootstrap_host = 1;
|
||||
int32 bootstrap_port = 2;
|
||||
int32 bootstrap_room = 3;
|
||||
}
|
||||
|
||||
// =====================
|
||||
// Generate Request
|
||||
// =====================
|
||||
|
||||
message GenerateRequest {
|
||||
string request_id = 1;
|
||||
|
||||
// Input must be tokenized (no raw text)
|
||||
TokenizedInput tokenized = 2;
|
||||
|
||||
// Multimodal inputs
|
||||
MultimodalInputs mm_inputs = 3;
|
||||
|
||||
// Generation parameters
|
||||
SamplingParams sampling_params = 4;
|
||||
|
||||
// Return options
|
||||
bool return_logprob = 5;
|
||||
int32 logprob_start_len = 6;
|
||||
int32 top_logprobs_num = 7;
|
||||
repeated uint32 token_ids_logprob = 8;
|
||||
bool return_hidden_states = 9;
|
||||
|
||||
// For disaggregated serving
|
||||
DisaggregatedParams disaggregated_params = 10;
|
||||
|
||||
// Custom logit processor (serialized)
|
||||
string custom_logit_processor = 11;
|
||||
|
||||
// Request metadata
|
||||
google.protobuf.Timestamp timestamp = 12;
|
||||
bool log_metrics = 13;
|
||||
|
||||
// Input embeddings (alternative to text/tokens)
|
||||
repeated float input_embeds = 14;
|
||||
|
||||
// LoRA adapter ID (if pre-loaded)
|
||||
string lora_id = 15;
|
||||
|
||||
// Data parallel routing
|
||||
int32 data_parallel_rank = 16;
|
||||
|
||||
// Whether client wants streaming response
|
||||
bool stream = 17;
|
||||
}
|
||||
|
||||
message TokenizedInput {
|
||||
string original_text = 1; // For reference
|
||||
repeated uint32 input_ids = 2;
|
||||
}
|
||||
|
||||
message MultimodalInputs {
|
||||
// Simplified multimodal handling - actual data processed by tokenizer
|
||||
repeated string image_urls = 1;
|
||||
repeated string video_urls = 2;
|
||||
repeated string audio_urls = 3;
|
||||
|
||||
// Pre-processed multimodal features (if available)
|
||||
google.protobuf.Struct processed_features = 4;
|
||||
|
||||
// Raw data for direct processing
|
||||
repeated bytes image_data = 5;
|
||||
repeated bytes video_data = 6;
|
||||
repeated bytes audio_data = 7;
|
||||
|
||||
// Modality metadata
|
||||
repeated string modalities = 8;
|
||||
}
|
||||
|
||||
// =====================
|
||||
// Generate Response
|
||||
// =====================
|
||||
|
||||
message GenerateResponse {
|
||||
string request_id = 1;
|
||||
|
||||
// Response type
|
||||
oneof response {
|
||||
GenerateStreamChunk chunk = 2;
|
||||
GenerateComplete complete = 3;
|
||||
GenerateError error = 4;
|
||||
}
|
||||
}
|
||||
|
||||
message GenerateStreamChunk {
|
||||
// Generated tokens (incremental chunk)
|
||||
repeated uint32 token_ids = 1;
|
||||
|
||||
// Cumulative counts
|
||||
int32 prompt_tokens = 2;
|
||||
int32 completion_tokens = 3;
|
||||
int32 cached_tokens = 4;
|
||||
|
||||
// Output logprobs (if requested) - incremental for streaming
|
||||
OutputLogProbs output_logprobs = 5;
|
||||
|
||||
// Hidden states (if requested)
|
||||
repeated float hidden_states = 6;
|
||||
|
||||
// Input logprobs (if requested) - only in first chunk
|
||||
InputLogProbs input_logprobs = 7;
|
||||
|
||||
// Index for ordering when n>1 (for parallel request multiplexing)
|
||||
uint32 index = 8;
|
||||
}
|
||||
|
||||
message GenerateComplete {
|
||||
// Final output
|
||||
repeated uint32 output_ids = 1;
|
||||
|
||||
// Finish reason as OpenAI-compatible string ("stop", "length", "abort")
|
||||
string finish_reason = 2;
|
||||
|
||||
// Token usage counts
|
||||
int32 prompt_tokens = 3;
|
||||
int32 completion_tokens = 4;
|
||||
int32 cached_tokens = 5;
|
||||
|
||||
// Output logprobs if requested (cumulative)
|
||||
OutputLogProbs output_logprobs = 6;
|
||||
|
||||
// All hidden states if requested
|
||||
repeated HiddenStates all_hidden_states = 7;
|
||||
|
||||
// Matched stop information (for stop sequences)
|
||||
oneof matched_stop {
|
||||
uint32 matched_token_id = 8;
|
||||
string matched_stop_str = 9;
|
||||
}
|
||||
|
||||
// Input logprobs if requested (for prompt tokens)
|
||||
InputLogProbs input_logprobs = 10;
|
||||
|
||||
// Index for ordering when n>1 (for parallel request multiplexing)
|
||||
uint32 index = 11;
|
||||
}
|
||||
|
||||
message GenerateError {
|
||||
string message = 1;
|
||||
string http_status_code = 2;
|
||||
string details = 3;
|
||||
}
|
||||
|
||||
// Output logprobs - all values are present (no None)
|
||||
message OutputLogProbs {
|
||||
repeated float token_logprobs = 1;
|
||||
repeated int32 token_ids = 2;
|
||||
|
||||
// Top logprobs at each position
|
||||
repeated TopLogProbs top_logprobs = 3;
|
||||
}
|
||||
|
||||
// Input logprobs - first token has no logprob (None)
|
||||
message InputLogProbs {
|
||||
repeated InputTokenLogProb token_logprobs = 1;
|
||||
repeated int32 token_ids = 2;
|
||||
|
||||
// Top logprobs at each position
|
||||
repeated TopLogProbs top_logprobs = 3;
|
||||
}
|
||||
|
||||
// Wrapper to represent optional logprob (first input token has no logprob)
|
||||
message InputTokenLogProb {
|
||||
optional float value = 1;
|
||||
}
|
||||
|
||||
message TopLogProbs {
|
||||
repeated float values = 1;
|
||||
repeated int32 token_ids = 2;
|
||||
}
|
||||
|
||||
message HiddenStates {
|
||||
repeated float values = 1;
|
||||
int32 layer = 2;
|
||||
int32 position = 3;
|
||||
}
|
||||
|
||||
// =====================
|
||||
// Embedding Request
|
||||
// =====================
|
||||
|
||||
message EmbedRequest {
|
||||
string request_id = 1;
|
||||
|
||||
// Input must be tokenized (no raw text)
|
||||
TokenizedInput tokenized = 2;
|
||||
|
||||
// Multimodal inputs
|
||||
MultimodalInputs mm_inputs = 4;
|
||||
|
||||
// Dummy sampling params for compatibility
|
||||
// EmbedRequest doesn't use sampling_params
|
||||
SamplingParams sampling_params = 5;
|
||||
|
||||
bool log_metrics = 6;
|
||||
|
||||
// Token type IDs for models that require them
|
||||
repeated int32 token_type_ids = 7;
|
||||
|
||||
// Data parallel routing
|
||||
int32 data_parallel_rank = 8;
|
||||
|
||||
// For cross-encoder requests
|
||||
bool is_cross_encoder = 9;
|
||||
repeated string texts = 10; // For cross-encoder batch
|
||||
}
|
||||
|
||||
message EmbedResponse {
|
||||
string request_id = 1;
|
||||
|
||||
oneof response {
|
||||
EmbedComplete complete = 2;
|
||||
EmbedError error = 3;
|
||||
}
|
||||
}
|
||||
|
||||
message EmbedComplete {
|
||||
repeated float embedding = 1;
|
||||
int32 prompt_tokens = 2;
|
||||
int32 cached_tokens = 3;
|
||||
|
||||
// Additional metadata
|
||||
int32 embedding_dim = 4;
|
||||
|
||||
// For batch embeddings
|
||||
repeated Embedding batch_embeddings = 5;
|
||||
}
|
||||
|
||||
message Embedding {
|
||||
repeated float values = 1;
|
||||
int32 index = 2;
|
||||
}
|
||||
|
||||
message EmbedError {
|
||||
string message = 1;
|
||||
string code = 2;
|
||||
string details = 3;
|
||||
}
|
||||
|
||||
// =====================
|
||||
// Management Operations
|
||||
// =====================
|
||||
|
||||
message HealthCheckRequest {}
|
||||
|
||||
message HealthCheckResponse {
|
||||
bool healthy = 1;
|
||||
string message = 2;
|
||||
}
|
||||
|
||||
message AbortRequest {
|
||||
string request_id = 1;
|
||||
string reason = 2;
|
||||
}
|
||||
|
||||
message AbortResponse {
|
||||
bool success = 1;
|
||||
string message = 2;
|
||||
}
|
||||
|
||||
|
||||
// =====================
|
||||
// Additional Operations (Future)
|
||||
// =====================
|
||||
|
||||
// Load LoRA adapter
|
||||
message LoadLoRARequest {
|
||||
string adapter_id = 1;
|
||||
string adapter_path = 2;
|
||||
int32 rank = 3;
|
||||
}
|
||||
|
||||
message LoadLoRAResponse {
|
||||
bool success = 1;
|
||||
string adapter_id = 2;
|
||||
string message = 3;
|
||||
}
|
||||
|
||||
// Unload LoRA adapter
|
||||
message UnloadLoRARequest {
|
||||
string adapter_id = 1;
|
||||
}
|
||||
|
||||
message UnloadLoRAResponse {
|
||||
bool success = 1;
|
||||
string message = 2;
|
||||
}
|
||||
|
||||
// Update weights
|
||||
message UpdateWeightsRequest {
|
||||
oneof source {
|
||||
string disk_path = 1;
|
||||
bytes tensor_data = 2;
|
||||
string remote_url = 3;
|
||||
}
|
||||
string weight_name = 4;
|
||||
}
|
||||
|
||||
message UpdateWeightsResponse {
|
||||
bool success = 1;
|
||||
string message = 2;
|
||||
}
|
||||
|
||||
// Get internal state for debugging
|
||||
message GetInternalStateRequest {
|
||||
repeated string state_keys = 1;
|
||||
}
|
||||
|
||||
message GetInternalStateResponse {
|
||||
google.protobuf.Struct state = 1;
|
||||
}
|
||||
|
||||
// Set internal state for testing
|
||||
message SetInternalStateRequest {
|
||||
google.protobuf.Struct state = 1;
|
||||
}
|
||||
|
||||
message SetInternalStateResponse {
|
||||
bool success = 1;
|
||||
string message = 2;
|
||||
}
|
||||
|
||||
// =====================
|
||||
// Model and Server Info
|
||||
// =====================
|
||||
|
||||
// Get model information
|
||||
message GetModelInfoRequest {}
|
||||
|
||||
message GetModelInfoResponse {
|
||||
string model_path = 1;
|
||||
string tokenizer_path = 2;
|
||||
bool is_generation = 3;
|
||||
string preferred_sampling_params = 4; // JSON string or empty
|
||||
string weight_version = 5;
|
||||
string served_model_name = 6;
|
||||
int32 max_context_length = 7;
|
||||
int32 vocab_size = 8;
|
||||
bool supports_vision = 9;
|
||||
string model_type = 10;
|
||||
repeated int32 eos_token_ids = 11;
|
||||
int32 pad_token_id = 12;
|
||||
int32 bos_token_id = 13;
|
||||
int32 max_req_input_len = 14;
|
||||
repeated string architectures = 15;
|
||||
|
||||
// Classification model support (from HuggingFace config.json)
|
||||
// id2label maps class indices to label names, e.g., {"0": "negative", "1": "positive"}
|
||||
string id2label_json = 16;
|
||||
// Number of classification labels (0 if not a classifier)
|
||||
int32 num_labels = 17;
|
||||
}
|
||||
|
||||
// Get server information
|
||||
message GetServerInfoRequest {}
|
||||
|
||||
message GetServerInfoResponse {
|
||||
// Server configuration (as structured data)
|
||||
google.protobuf.Struct server_args = 1;
|
||||
|
||||
// Scheduler metrics (from scheduler initialization)
|
||||
google.protobuf.Struct scheduler_info = 2;
|
||||
|
||||
// Runtime state
|
||||
int32 active_requests = 3;
|
||||
bool is_paused = 4;
|
||||
double last_receive_timestamp = 5;
|
||||
double uptime_seconds = 6;
|
||||
|
||||
// Version info
|
||||
string sglang_version = 7;
|
||||
|
||||
// Server metadata
|
||||
string server_type = 8; // "grpc"
|
||||
google.protobuf.Timestamp start_time = 9;
|
||||
|
||||
// Note: internal_states not provided in gRPC mode
|
||||
// Scheduler-side metrics (memory usage, throughput) require
|
||||
// bidirectional communicator infrastructure not available in gRPC.
|
||||
// Use HTTP /get_server_info if scheduler internal state is needed.
|
||||
}
|
||||
|
||||
// =====================
|
||||
// Load Metrics (v1/loads)
|
||||
// =====================
|
||||
|
||||
message GetLoadsRequest {
|
||||
// Optional: filter to specific DP rank
|
||||
optional int32 dp_rank = 1;
|
||||
|
||||
// Sections to include: core, memory, spec, lora, disagg, queues, all
|
||||
repeated string include = 2;
|
||||
}
|
||||
|
||||
message GetLoadsResponse {
|
||||
// ISO 8601 timestamp
|
||||
string timestamp = 1;
|
||||
|
||||
// SGLang version
|
||||
string version = 2;
|
||||
|
||||
// Number of DP ranks
|
||||
int32 dp_rank_count = 3;
|
||||
|
||||
// Per-DP-rank load metrics
|
||||
repeated SchedulerLoad loads = 4;
|
||||
|
||||
// Aggregate metrics across all DP ranks
|
||||
AggregateMetrics aggregate = 5;
|
||||
}
|
||||
|
||||
message SchedulerLoad {
|
||||
int32 dp_rank = 1;
|
||||
|
||||
// Core metrics (always included)
|
||||
int32 num_running_reqs = 2;
|
||||
int32 num_waiting_reqs = 3;
|
||||
int32 num_total_reqs = 4;
|
||||
int32 num_used_tokens = 5;
|
||||
int32 max_total_num_tokens = 6;
|
||||
double token_usage = 7;
|
||||
double gen_throughput = 8;
|
||||
double cache_hit_rate = 9;
|
||||
double utilization = 10;
|
||||
int32 max_running_requests = 11;
|
||||
|
||||
// Optional sections
|
||||
optional MemoryMetrics memory = 12;
|
||||
optional SpeculativeMetrics speculative = 13;
|
||||
optional LoRAMetrics lora = 14;
|
||||
optional DisaggregationMetrics disaggregation = 15;
|
||||
optional QueueMetrics queues = 16;
|
||||
}
|
||||
|
||||
message MemoryMetrics {
|
||||
double weight_gb = 1;
|
||||
double kv_cache_gb = 2;
|
||||
double graph_gb = 3;
|
||||
int32 token_capacity = 4;
|
||||
}
|
||||
|
||||
message SpeculativeMetrics {
|
||||
double accept_length = 1;
|
||||
double accept_rate = 2;
|
||||
}
|
||||
|
||||
message LoRAMetrics {
|
||||
int32 slots_used = 1;
|
||||
int32 slots_total = 2;
|
||||
double utilization = 3;
|
||||
}
|
||||
|
||||
message DisaggregationMetrics {
|
||||
string mode = 1; // "prefill", "decode", or "null"
|
||||
int32 prefill_prealloc_queue_reqs = 2;
|
||||
int32 prefill_inflight_queue_reqs = 3;
|
||||
int32 decode_prealloc_queue_reqs = 4;
|
||||
int32 decode_transfer_queue_reqs = 5;
|
||||
int32 decode_retracted_queue_reqs = 6;
|
||||
double kv_transfer_speed_gb_s = 7;
|
||||
double kv_transfer_latency_ms = 8;
|
||||
}
|
||||
|
||||
message QueueMetrics {
|
||||
int32 waiting = 1;
|
||||
int32 grammar = 2;
|
||||
int32 paused = 3;
|
||||
int32 retracted = 4;
|
||||
}
|
||||
|
||||
message AggregateMetrics {
|
||||
int32 total_running_reqs = 1;
|
||||
int32 total_waiting_reqs = 2;
|
||||
int32 total_reqs = 3;
|
||||
double avg_token_usage = 4;
|
||||
double avg_throughput = 5;
|
||||
double avg_utilization = 6;
|
||||
}
|
||||
Reference in New Issue
Block a user