refactor: replace local proto compilation with smg-grpc-proto package (#18682)

This commit is contained in:
Simo Lin
2026-02-12 05:29:24 -08:00
committed by GitHub
parent c59b9223e6
commit 92c5749f41
11 changed files with 22 additions and 981 deletions

View File

@@ -32,19 +32,3 @@ jobs:
extensions: h,c,cpp,hpp,cu,cuh,cc
clangFormatVersion: 18
style: file
- name: Check proto files are in sync
run: |
if ! diff -q python/sglang/srt/grpc/sglang_scheduler.proto sgl-model-gateway/src/proto/sglang_scheduler.proto; then
echo "❌ ERROR: Proto files are out of sync!"
echo ""
echo "The following files must be kept identical:"
echo " - python/sglang/srt/grpc/sglang_scheduler.proto"
echo " - sgl-model-gateway/src/proto/sglang_scheduler.proto"
echo ""
echo "Please ensure both files have the same content."
echo ""
echo "Differences:"
diff python/sglang/srt/grpc/sglang_scheduler.proto sgl-model-gateway/src/proto/sglang_scheduler.proto || true
exit 1
fi

4
.gitignore vendored
View File

@@ -262,10 +262,6 @@ outputs/
# setuptools-scm generated version file
python/sglang/_version.py
# Generated protobuf files (regenerate during wheel build or with compile_proto.py)
python/sglang/srt/grpc/*_pb2.py
python/sglang/srt/grpc/*_pb2_grpc.py
python/sglang/srt/grpc/*_pb2.pyi
# MUSA section
# Generated source files by torchada

View File

@@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
@@ -76,10 +76,10 @@ dependencies = [
"uvloop",
"xgrammar==0.1.27",
"grpcio==1.75.1", # keep it aligned with compile_proto.py
"grpcio-tools==1.75.1", # keep it aligned with compile_proto.py
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
"grpcio-health-checking==1.75.1", # required for Kubernetes gRPC health probes
"smg-grpc-proto>=0.3.3",
"grpcio>=1.78.0",
"grpcio-reflection>=1.78.0",
"grpcio-health-checking>=1.78.0",
]
[[tool.uv.index]]

View File

@@ -1,6 +1,6 @@
# https://docs.sglang.io/platforms/cpu_server.html
[build-system]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
@@ -68,9 +68,9 @@ dependencies = [
"uvicorn",
"uvloop",
"xgrammar==0.1.27",
"grpcio==1.75.1", # keep it aligned with compile_proto.py
"grpcio-tools==1.75.1", # keep it aligned with compile_proto.py
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
"smg-grpc-proto>=0.3.3",
"grpcio>=1.78.0",
"grpcio-reflection>=1.78.0",
]
[project.optional-dependencies]

View File

@@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
@@ -61,9 +61,9 @@ dependencies = [
"uvicorn",
"uvloop",
"xgrammar==0.1.27",
"grpcio==1.75.1", # keep it aligned with compile_proto.py
"grpcio-tools==1.75.1", # keep it aligned with compile_proto.py
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
"smg-grpc-proto>=0.3.3",
"grpcio>=1.78.0",
"grpcio-reflection>=1.78.0",
]
[project.optional-dependencies]

View File

@@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
@@ -63,9 +63,9 @@ runtime_common = [
"uvicorn",
"uvloop",
"xgrammar==0.1.27",
"grpcio==1.75.1", # keep it aligned with compile_proto.py
"grpcio-tools==1.75.1", # keep it aligned with compile_proto.py
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
"smg-grpc-proto>=0.3.3",
"grpcio>=1.78.0",
"grpcio-reflection>=1.78.0",
]
tracing = [

View File

@@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
@@ -67,9 +67,9 @@ dependencies = [
"uvicorn",
"uvloop",
# "xgrammar==0.1.24",  # xgrammar depends on CUDA PyTorch and Triton only
"grpcio==1.75.1", # keep it aligned with compile_proto.py
"grpcio-tools==1.75.1", # keep it aligned with compile_proto.py
"grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
"smg-grpc-proto>=0.3.3",
"grpcio>=1.78.0",
"grpcio-reflection>=1.78.0",
]
[project.optional-dependencies]

View File

@@ -1,125 +0,0 @@
"""
Custom setup.py for SGLang that compiles protobuf files during build.
This file works alongside pyproject.toml. It hooks into the build process
to automatically generate gRPC/protobuf Python files from .proto sources
when building the wheel or doing editable installs.
"""
import os
from pathlib import Path
from setuptools import setup
from setuptools.command.build_py import build_py
from setuptools.command.develop import develop
from setuptools.command.egg_info import egg_info
from setuptools.errors import SetupError
PROTO_SOURCE = "sglang/srt/grpc/sglang_scheduler.proto"
def compile_proto():
"""Compile the protobuf file to Python using grpc_tools.protoc."""
proto_path = Path(__file__).parent / PROTO_SOURCE
if not proto_path.exists():
print(f"Warning: Proto file not found at {proto_path}, skipping generation")
return
print(f"Generating gRPC files from {PROTO_SOURCE}")
output_dir = proto_path.parent
proto_dir = proto_path.parent
# Import grpc_tools.protoc directly instead of running as subprocess.
# This ensures we use the grpcio-tools installed in the build environment,
# since sys.executable may point to the main Python interpreter in
# pip's isolated build environments.
try:
import grpc_tools
from grpc_tools import protoc
except ImportError as e:
raise SetupError(
f"Failed to import grpc_tools: {e}. "
"Ensure grpcio-tools is listed in build-system.requires in pyproject.toml"
)
# Get the path to well-known proto files bundled with grpcio-tools
# (e.g., google/protobuf/timestamp.proto, google/protobuf/struct.proto)
grpc_tools_proto_path = Path(grpc_tools.__file__).parent / "_proto"
# Build the protoc arguments (protoc.main expects argv-style list)
args = [
"protoc", # argv[0] is the program name
f"-I{proto_dir}",
f"-I{grpc_tools_proto_path}", # Include path for well-known protos
f"--python_out={output_dir}",
f"--grpc_python_out={output_dir}",
f"--pyi_out={output_dir}",
str(proto_dir / proto_path.name),
]
print(f"Running protoc with args: {args[1:]}")
# Save and restore cwd since protoc may change it
original_cwd = os.getcwd()
try:
result = protoc.main(args)
if result != 0:
raise SetupError(f"protoc failed with exit code {result}")
finally:
os.chdir(original_cwd)
# Fix imports in generated grpc file (change absolute to relative imports)
_fix_imports(output_dir, proto_path.stem)
print(f"Successfully generated gRPC files in {output_dir}")
def _fix_imports(output_dir: Path, proto_stem: str):
"""Fix imports in generated files to use relative imports."""
grpc_file = output_dir / f"{proto_stem}_pb2_grpc.py"
if grpc_file.exists():
content = grpc_file.read_text()
# Change absolute import to relative import
old_import = f"import {proto_stem}_pb2"
new_import = f"from . import {proto_stem}_pb2"
if old_import in content:
content = content.replace(old_import, new_import)
grpc_file.write_text(content)
print("Fixed imports in generated gRPC file")
class BuildPyWithProto(build_py):
"""Build Python modules, generating gRPC files from .proto sources first."""
def run(self):
compile_proto()
super().run()
class DevelopWithProto(develop):
"""Editable install with gRPC file generation."""
def run(self):
compile_proto()
super().run()
class EggInfoWithProto(egg_info):
"""Egg info generation with gRPC file generation."""
def run(self):
compile_proto()
super().run()
setup(
cmdclass={
"build_py": BuildPyWithProto,
"develop": DevelopWithProto,
"egg_info": EggInfoWithProto,
},
)

View File

@@ -21,11 +21,11 @@ from google.protobuf.struct_pb2 import Struct
from google.protobuf.timestamp_pb2 import Timestamp
from grpc_health.v1 import health_pb2_grpc
from grpc_reflection.v1alpha import reflection
from smg_grpc_proto import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
import sglang
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
from sglang.srt.grpc.grpc_request_manager import GrpcRequestManager
from sglang.srt.grpc.health_servicer import SGLangHealthServicer
from sglang.srt.grpc.scheduler_launcher import launch_scheduler_process_only

View File

@@ -1,248 +0,0 @@
#!/usr/bin/env python3
"""
Compile protobuf files for SGLang gRPC server.
This script compiles .proto files to Python code using grpc_tools.protoc.
It generates:
- *_pb2.py (protobuf message classes)
- *_pb2_grpc.py (gRPC service classes)
- *_pb2.pyi (type hints for mypy/IDEs)
Usage:
python compile_proto.py [--check] [--proto-file PROTO_FILE]
Options:
--check Check if regeneration is needed (exit 1 if needed)
--proto-file Specify proto file (default: sglang_scheduler.proto)
### Install Dependencies
pip install "grpcio==1.75.1" "grpcio-tools==1.75.1"
Please make sure to use the same version of grpcio and grpcio-tools specified in pyproject.toml
otherwise update the versions specified in pyproject.toml
### Run Script
cd python/sglang/srt/grpc
python compile_proto.py
"""
import argparse
import subprocess
import sys
from importlib.metadata import version
from pathlib import Path
GRPC_VERSION = "1.75.1"
def get_file_mtime(path: Path) -> float:
"""Get file modification time, return 0 if file doesn't exist."""
try:
return path.stat().st_mtime
except FileNotFoundError:
return 0.0
def check_regeneration_needed(proto_file: Path, output_dir: Path) -> bool:
"""Check if proto files are newer than generated files."""
proto_mtime = get_file_mtime(proto_file)
generated_files = [
output_dir / f"{proto_file.stem}_pb2.py",
output_dir / f"{proto_file.stem}_pb2_grpc.py",
output_dir / f"{proto_file.stem}_pb2.pyi",
]
for gen_file in generated_files:
if get_file_mtime(gen_file) < proto_mtime:
return True
return False
def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> bool:
"""Compile the protobuf file to Python."""
if not proto_file.exists():
print(f"Error: Proto file not found: {proto_file}")
return False
if verbose:
print(f"Found proto file: {proto_file}")
# Check if grpc_tools is available
try:
import grpc_tools.protoc # noqa: F401
except ImportError:
print("Error: grpcio-tools not installed")
print(
f'Install with: pip install "grpcio-tools=={GRPC_VERSION}" "grpcio=={GRPC_VERSION}"'
)
return False
grpc_tools_version = version("grpcio-tools")
grpc_version = version("grpcio")
if grpc_tools_version != GRPC_VERSION or grpc_version != GRPC_VERSION:
raise RuntimeError(
f"Error: grpcio-tools version {grpc_tools_version} and grpcio version {grpc_version} detected, but {GRPC_VERSION} is required."
)
# Compile command
cmd = [
sys.executable,
"-m",
"grpc_tools.protoc",
f"-I{proto_file.parent}",
f"--python_out={output_dir}",
f"--grpc_python_out={output_dir}",
f"--pyi_out={output_dir}", # Generate type stubs
str(proto_file.name),
]
if verbose:
print(f"Running: {' '.join(cmd)}")
# Run protoc
result = subprocess.run(cmd, capture_output=True, text=True, cwd=proto_file.parent)
if result.returncode != 0:
print(f"Error compiling proto:")
print(result.stderr)
if result.stdout:
print(result.stdout)
return False
# Verify generated files exist
generated_files = [
f"{proto_file.stem}_pb2.py",
f"{proto_file.stem}_pb2_grpc.py",
f"{proto_file.stem}_pb2.pyi",
]
missing_files = []
for gen_file in generated_files:
if not (output_dir / gen_file).exists():
missing_files.append(gen_file)
if missing_files:
print(f"Error: Expected generated files not found: {missing_files}")
return False
if verbose:
print("Successfully compiled protobuf files:")
for gen_file in generated_files:
print(f" - {output_dir}/{gen_file}")
# Fix imports in generated files
fix_imports(output_dir, proto_file.stem, verbose)
return True
def fix_imports(output_dir: Path, proto_stem: str, verbose: bool = True) -> None:
"""Fix imports in generated files to use relative imports."""
grpc_file = output_dir / f"{proto_stem}_pb2_grpc.py"
if grpc_file.exists():
content = grpc_file.read_text()
# Change absolute import to relative import
old_import = f"import {proto_stem}_pb2"
new_import = f"from . import {proto_stem}_pb2"
if old_import in content:
content = content.replace(old_import, new_import)
grpc_file.write_text(content)
if verbose:
print("Fixed imports in generated files")
def add_generation_header(output_dir: Path, proto_stem: str) -> None:
"""Add header to generated files indicating they are auto-generated."""
header = """# This file is auto-generated. Do not edit manually.
# Regenerate with: python compile_proto.py
"""
files_to_update = [f"{proto_stem}_pb2.py", f"{proto_stem}_pb2_grpc.py"]
for filename in files_to_update:
file_path = output_dir / filename
if file_path.exists():
content = file_path.read_text()
if not content.startswith("# This file is auto-generated"):
file_path.write_text(header + content)
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description="Compile protobuf files for SGLang gRPC server",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
parser.add_argument(
"--check",
action="store_true",
help="Check if regeneration is needed (exit 1 if needed)",
)
parser.add_argument(
"--proto-file",
type=str,
default="sglang_scheduler.proto",
help="Proto file to compile (default: sglang_scheduler.proto)",
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=True,
help="Verbose output (default: True)",
)
parser.add_argument(
"-q", "--quiet", action="store_true", help="Quiet mode (overrides verbose)"
)
args = parser.parse_args()
# Handle verbosity
verbose = args.verbose and not args.quiet
# Get paths
script_dir = Path(__file__).parent
proto_file = script_dir / args.proto_file
output_dir = script_dir
# Check mode
if args.check:
if check_regeneration_needed(proto_file, output_dir):
if verbose:
print("Proto files need regeneration")
sys.exit(1)
else:
if verbose:
print("Generated files are up to date")
sys.exit(0)
# Compile mode
success = compile_proto(proto_file, output_dir, verbose)
if success:
# Add generation headers
add_generation_header(output_dir, proto_file.stem)
if verbose:
print("\n✅ Protobuf compilation successful!")
print("Generated files are ready for use")
else:
if verbose:
print("\n❌ Protobuf compilation failed!")
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -1,566 +0,0 @@
syntax = "proto3";
package sglang.grpc.scheduler;
import "google/protobuf/timestamp.proto";
import "google/protobuf/struct.proto";
// Service definition for SGLang scheduler communication
// This protocol bridges the Rust router and Python scheduler
service SglangScheduler {
// Submit a generation request (supports streaming)
rpc Generate(GenerateRequest) returns (stream GenerateResponse);
// Submit an embedding request
rpc Embed(EmbedRequest) returns (EmbedResponse);
// Health check and metrics
rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);
// Abort a running request
rpc Abort(AbortRequest) returns (AbortResponse);
// Get model information
rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);
// Get server information
rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
// Get comprehensive load metrics
rpc GetLoads(GetLoadsRequest) returns (GetLoadsResponse);
}
// =====================
// Common Types
// =====================
// Sampling parameters matching SGLang's SamplingParams
//
// IMPORTANT: Do not use SamplingParams::default() directly!
// The proto3 defaults (0 for numeric fields) do NOT match the semantic defaults
// (temperature=1.0, top_p=1.0, top_k=-1, etc.). Always construct with explicit values
// or use the conversion functions in sglang_scheduler.rs / grpc_server.py.
message SamplingParams {
float temperature = 1;
float top_p = 2;
int32 top_k = 3;
float min_p = 4;
float frequency_penalty = 5;
float presence_penalty = 6;
float repetition_penalty = 7;
optional int32 max_new_tokens = 8;
repeated string stop = 9;
repeated uint32 stop_token_ids = 10;
bool skip_special_tokens = 11;
bool spaces_between_special_tokens = 12;
// Structured generation
oneof constraint {
string regex = 13;
string json_schema = 14;
string ebnf_grammar = 15;
string structural_tag = 16;
}
// Speculative decoding
int32 n = 17; // Number of samples
// Additional parameters
int32 min_new_tokens = 18;
bool ignore_eos = 19;
bool no_stop_trim = 20;
optional int32 stream_interval = 21;
map<string, float> logit_bias = 22;
// Custom parameters for extensibility
google.protobuf.Struct custom_params = 23;
}
// Disaggregated serving parameters
message DisaggregatedParams {
string bootstrap_host = 1;
int32 bootstrap_port = 2;
int32 bootstrap_room = 3;
}
// =====================
// Generate Request
// =====================
message GenerateRequest {
string request_id = 1;
// Input must be tokenized (no raw text)
TokenizedInput tokenized = 2;
// Multimodal inputs
MultimodalInputs mm_inputs = 3;
// Generation parameters
SamplingParams sampling_params = 4;
// Return options
bool return_logprob = 5;
int32 logprob_start_len = 6;
int32 top_logprobs_num = 7;
repeated uint32 token_ids_logprob = 8;
bool return_hidden_states = 9;
// For disaggregated serving
DisaggregatedParams disaggregated_params = 10;
// Custom logit processor (serialized)
string custom_logit_processor = 11;
// Request metadata
google.protobuf.Timestamp timestamp = 12;
bool log_metrics = 13;
// Input embeddings (alternative to text/tokens)
repeated float input_embeds = 14;
// LoRA adapter ID (if pre-loaded)
string lora_id = 15;
// Data parallel routing
int32 data_parallel_rank = 16;
// Whether client wants streaming response
bool stream = 17;
}
message TokenizedInput {
string original_text = 1; // For reference
repeated uint32 input_ids = 2;
}
message MultimodalInputs {
// Simplified multimodal handling - actual data processed by tokenizer
repeated string image_urls = 1;
repeated string video_urls = 2;
repeated string audio_urls = 3;
// Pre-processed multimodal features (if available)
google.protobuf.Struct processed_features = 4;
// Raw data for direct processing
repeated bytes image_data = 5;
repeated bytes video_data = 6;
repeated bytes audio_data = 7;
// Modality metadata
repeated string modalities = 8;
}
// =====================
// Generate Response
// =====================
message GenerateResponse {
string request_id = 1;
// Response type
oneof response {
GenerateStreamChunk chunk = 2;
GenerateComplete complete = 3;
GenerateError error = 4;
}
}
message GenerateStreamChunk {
// Generated tokens (incremental chunk)
repeated uint32 token_ids = 1;
// Cumulative counts
int32 prompt_tokens = 2;
int32 completion_tokens = 3;
int32 cached_tokens = 4;
// Output logprobs (if requested) - incremental for streaming
OutputLogProbs output_logprobs = 5;
// Hidden states (if requested)
repeated float hidden_states = 6;
// Input logprobs (if requested) - only in first chunk
InputLogProbs input_logprobs = 7;
// Index for ordering when n>1 (for parallel request multiplexing)
uint32 index = 8;
}
message GenerateComplete {
// Final output
repeated uint32 output_ids = 1;
// Finish reason as OpenAI-compatible string ("stop", "length", "abort")
string finish_reason = 2;
// Token usage counts
int32 prompt_tokens = 3;
int32 completion_tokens = 4;
int32 cached_tokens = 5;
// Output logprobs if requested (cumulative)
OutputLogProbs output_logprobs = 6;
// All hidden states if requested
repeated HiddenStates all_hidden_states = 7;
// Matched stop information (for stop sequences)
oneof matched_stop {
uint32 matched_token_id = 8;
string matched_stop_str = 9;
}
// Input logprobs if requested (for prompt tokens)
InputLogProbs input_logprobs = 10;
// Index for ordering when n>1 (for parallel request multiplexing)
uint32 index = 11;
}
message GenerateError {
string message = 1;
string http_status_code = 2;
string details = 3;
}
// Output logprobs - all values are present (no None)
message OutputLogProbs {
repeated float token_logprobs = 1;
repeated int32 token_ids = 2;
// Top logprobs at each position
repeated TopLogProbs top_logprobs = 3;
}
// Input logprobs - first token has no logprob (None)
message InputLogProbs {
repeated InputTokenLogProb token_logprobs = 1;
repeated int32 token_ids = 2;
// Top logprobs at each position
repeated TopLogProbs top_logprobs = 3;
}
// Wrapper to represent optional logprob (first input token has no logprob)
message InputTokenLogProb {
optional float value = 1;
}
message TopLogProbs {
repeated float values = 1;
repeated int32 token_ids = 2;
}
message HiddenStates {
repeated float values = 1;
int32 layer = 2;
int32 position = 3;
}
// =====================
// Embedding Request
// =====================
message EmbedRequest {
string request_id = 1;
// Input must be tokenized (no raw text)
TokenizedInput tokenized = 2;
// Multimodal inputs
MultimodalInputs mm_inputs = 4;
// Dummy sampling params for compatibility
// EmbedRequest doesn't use sampling_params
SamplingParams sampling_params = 5;
bool log_metrics = 6;
// Token type IDs for models that require them
repeated int32 token_type_ids = 7;
// Data parallel routing
int32 data_parallel_rank = 8;
// For cross-encoder requests
bool is_cross_encoder = 9;
repeated string texts = 10; // For cross-encoder batch
}
message EmbedResponse {
string request_id = 1;
oneof response {
EmbedComplete complete = 2;
EmbedError error = 3;
}
}
message EmbedComplete {
repeated float embedding = 1;
int32 prompt_tokens = 2;
int32 cached_tokens = 3;
// Additional metadata
int32 embedding_dim = 4;
// For batch embeddings
repeated Embedding batch_embeddings = 5;
}
message Embedding {
repeated float values = 1;
int32 index = 2;
}
message EmbedError {
string message = 1;
string code = 2;
string details = 3;
}
// =====================
// Management Operations
// =====================
message HealthCheckRequest {}
message HealthCheckResponse {
bool healthy = 1;
string message = 2;
}
message AbortRequest {
string request_id = 1;
string reason = 2;
}
message AbortResponse {
bool success = 1;
string message = 2;
}
// =====================
// Additional Operations (Future)
// =====================
// Load LoRA adapter
message LoadLoRARequest {
string adapter_id = 1;
string adapter_path = 2;
int32 rank = 3;
}
message LoadLoRAResponse {
bool success = 1;
string adapter_id = 2;
string message = 3;
}
// Unload LoRA adapter
message UnloadLoRARequest {
string adapter_id = 1;
}
message UnloadLoRAResponse {
bool success = 1;
string message = 2;
}
// Update weights
message UpdateWeightsRequest {
oneof source {
string disk_path = 1;
bytes tensor_data = 2;
string remote_url = 3;
}
string weight_name = 4;
}
message UpdateWeightsResponse {
bool success = 1;
string message = 2;
}
// Get internal state for debugging
message GetInternalStateRequest {
repeated string state_keys = 1;
}
message GetInternalStateResponse {
google.protobuf.Struct state = 1;
}
// Set internal state for testing
message SetInternalStateRequest {
google.protobuf.Struct state = 1;
}
message SetInternalStateResponse {
bool success = 1;
string message = 2;
}
// =====================
// Model and Server Info
// =====================
// Get model information
message GetModelInfoRequest {}
message GetModelInfoResponse {
string model_path = 1;
string tokenizer_path = 2;
bool is_generation = 3;
string preferred_sampling_params = 4; // JSON string or empty
string weight_version = 5;
string served_model_name = 6;
int32 max_context_length = 7;
int32 vocab_size = 8;
bool supports_vision = 9;
string model_type = 10;
repeated int32 eos_token_ids = 11;
int32 pad_token_id = 12;
int32 bos_token_id = 13;
int32 max_req_input_len = 14;
repeated string architectures = 15;
// Classification model support (from HuggingFace config.json)
// id2label maps class indices to label names, e.g., {"0": "negative", "1": "positive"}
string id2label_json = 16;
// Number of classification labels (0 if not a classifier)
int32 num_labels = 17;
}
// Get server information
message GetServerInfoRequest {}
message GetServerInfoResponse {
// Server configuration (as structured data)
google.protobuf.Struct server_args = 1;
// Scheduler metrics (from scheduler initialization)
google.protobuf.Struct scheduler_info = 2;
// Runtime state
int32 active_requests = 3;
bool is_paused = 4;
double last_receive_timestamp = 5;
double uptime_seconds = 6;
// Version info
string sglang_version = 7;
// Server metadata
string server_type = 8; // "grpc"
google.protobuf.Timestamp start_time = 9;
// Note: internal_states not provided in gRPC mode
// Scheduler-side metrics (memory usage, throughput) require
// bidirectional communicator infrastructure not available in gRPC.
// Use HTTP /get_server_info if scheduler internal state is needed.
}
// =====================
// Load Metrics (v1/loads)
// =====================
message GetLoadsRequest {
// Optional: filter to specific DP rank
optional int32 dp_rank = 1;
// Sections to include: core, memory, spec, lora, disagg, queues, all
repeated string include = 2;
}
message GetLoadsResponse {
// ISO 8601 timestamp
string timestamp = 1;
// SGLang version
string version = 2;
// Number of DP ranks
int32 dp_rank_count = 3;
// Per-DP-rank load metrics
repeated SchedulerLoad loads = 4;
// Aggregate metrics across all DP ranks
AggregateMetrics aggregate = 5;
}
message SchedulerLoad {
int32 dp_rank = 1;
// Core metrics (always included)
int32 num_running_reqs = 2;
int32 num_waiting_reqs = 3;
int32 num_total_reqs = 4;
int32 num_used_tokens = 5;
int32 max_total_num_tokens = 6;
double token_usage = 7;
double gen_throughput = 8;
double cache_hit_rate = 9;
double utilization = 10;
int32 max_running_requests = 11;
// Optional sections
optional MemoryMetrics memory = 12;
optional SpeculativeMetrics speculative = 13;
optional LoRAMetrics lora = 14;
optional DisaggregationMetrics disaggregation = 15;
optional QueueMetrics queues = 16;
}
message MemoryMetrics {
double weight_gb = 1;
double kv_cache_gb = 2;
double graph_gb = 3;
int32 token_capacity = 4;
}
message SpeculativeMetrics {
double accept_length = 1;
double accept_rate = 2;
}
message LoRAMetrics {
int32 slots_used = 1;
int32 slots_total = 2;
double utilization = 3;
}
message DisaggregationMetrics {
string mode = 1; // "prefill", "decode", or "null"
int32 prefill_prealloc_queue_reqs = 2;
int32 prefill_inflight_queue_reqs = 3;
int32 decode_prealloc_queue_reqs = 4;
int32 decode_transfer_queue_reqs = 5;
int32 decode_retracted_queue_reqs = 6;
double kv_transfer_speed_gb_s = 7;
double kv_transfer_latency_ms = 8;
}
message QueueMetrics {
int32 waiting = 1;
int32 grammar = 2;
int32 paused = 3;
int32 retracted = 4;
}
message AggregateMetrics {
int32 total_running_reqs = 1;
int32 total_waiting_reqs = 2;
int32 total_reqs = 3;
double avg_token_usage = 4;
double avg_throughput = 5;
double avg_utilization = 6;
}