refactor: replace local proto compilation with smg-grpc-proto package (#18682)

2026-04-20 22:39:01 +00:00 · 2026-02-12 05:29:24 -08:00
parent c59b9223e6
commit 92c5749f41
11 changed files with 22 additions and 981 deletions
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -32,19 +32,3 @@ jobs:
          extensions: h,c,cpp,hpp,cu,cuh,cc
          clangFormatVersion: 18
          style: file
-
-      - name: Check proto files are in sync
-        run: |
-          if ! diff -q python/sglang/srt/grpc/sglang_scheduler.proto sgl-model-gateway/src/proto/sglang_scheduler.proto; then
-            echo "❌ ERROR: Proto files are out of sync!"
-            echo ""
-            echo "The following files must be kept identical:"
-            echo "  - python/sglang/srt/grpc/sglang_scheduler.proto"
-            echo "  - sgl-model-gateway/src/proto/sglang_scheduler.proto"
-            echo ""
-            echo "Please ensure both files have the same content."
-            echo ""
-            echo "Differences:"
-            diff python/sglang/srt/grpc/sglang_scheduler.proto sgl-model-gateway/src/proto/sglang_scheduler.proto || true
-            exit 1
-          fi
--- a/.gitignore
+++ b/.gitignore
@@ -262,10 +262,6 @@ outputs/
 # setuptools-scm generated version file
 python/sglang/_version.py

-# Generated protobuf files (regenerate during wheel build or with compile_proto.py)
-python/sglang/srt/grpc/*_pb2.py
-python/sglang/srt/grpc/*_pb2_grpc.py
-python/sglang/srt/grpc/*_pb2.pyi

 # MUSA section
 # Generated source files by torchada
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
+requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
 build-backend = "setuptools.build_meta"

 [project]
@@ -76,10 +76,10 @@ dependencies = [
  "uvloop",
  "xgrammar==0.1.27",

-  "grpcio==1.75.1", # keep it align with compile_proto.py
-  "grpcio-tools==1.75.1", # keep it align with compile_proto.py
-  "grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
-  "grpcio-health-checking==1.75.1", # required for Kubernetes gRPC health probes
+  "smg-grpc-proto>=0.3.3",
+  "grpcio>=1.78.0",
+  "grpcio-reflection>=1.78.0",
+  "grpcio-health-checking>=1.78.0",
 ]

 [[tool.uv.index]]
--- a/python/pyproject_cpu.toml
+++ b/python/pyproject_cpu.toml
@@ -1,6 +1,6 @@
 # https://docs.sglang.io/platforms/cpu_server.html
 [build-system]
-requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
+requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
 build-backend = "setuptools.build_meta"

 [project]
@@ -68,9 +68,9 @@ dependencies = [
  "uvicorn",
  "uvloop",
  "xgrammar==0.1.27",
-  "grpcio==1.75.1", # keep it align with compile_proto.py
-  "grpcio-tools==1.75.1", # keep it align with compile_proto.py
-  "grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
+  "smg-grpc-proto>=0.3.3",
+  "grpcio>=1.78.0",
+  "grpcio-reflection>=1.78.0",
 ]

 [project.optional-dependencies]
--- a/python/pyproject_npu.toml
+++ b/python/pyproject_npu.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
+requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
 build-backend = "setuptools.build_meta"

 [project]
@@ -61,9 +61,9 @@ dependencies = [
  "uvicorn",
  "uvloop",
  "xgrammar==0.1.27",
-  "grpcio==1.75.1", # keep it align with compile_proto.py
-  "grpcio-tools==1.75.1", # keep it align with compile_proto.py
-  "grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
+  "smg-grpc-proto>=0.3.3",
+  "grpcio>=1.78.0",
+  "grpcio-reflection>=1.78.0",
 ]

 [project.optional-dependencies]
--- a/python/pyproject_other.toml
+++ b/python/pyproject_other.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
+requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
 build-backend = "setuptools.build_meta"

 [project]
@@ -63,9 +63,9 @@ runtime_common = [
  "uvicorn",
  "uvloop",
  "xgrammar==0.1.27",
-  "grpcio==1.75.1", # keep it align with compile_proto.py
-  "grpcio-tools==1.75.1", # keep it align with compile_proto.py
-  "grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
+  "smg-grpc-proto>=0.3.3",
+  "grpcio>=1.78.0",
+  "grpcio-reflection>=1.78.0",
 ]

 tracing = [
--- a/python/pyproject_xpu.toml
+++ b/python/pyproject_xpu.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel", "grpcio-tools==1.75.1"]
+requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
 build-backend = "setuptools.build_meta"

 [project]
@@ -67,9 +67,9 @@ dependencies = [
  "uvicorn",
  "uvloop",
  # "xgrammar==0.1.24", , xgrammar depends on CUDA PyTorch and Triton only
-  "grpcio==1.75.1", # keep it align with compile_proto.py
-  "grpcio-tools==1.75.1", # keep it align with compile_proto.py
-  "grpcio-reflection==1.75.1", # required by srt/entrypoints/grpc_server.py
+  "smg-grpc-proto>=0.3.3",
+  "grpcio>=1.78.0",
+  "grpcio-reflection>=1.78.0",
 ]

 [project.optional-dependencies]
--- a/python/setup.py
+++ b/python/setup.py
@@ -1,125 +0,0 @@
-"""
-Custom setup.py for SGLang that compiles protobuf files during build.
-
-This file works alongside pyproject.toml. It hooks into the build process
-to automatically generate gRPC/protobuf Python files from .proto sources
-when building the wheel or doing editable installs.
-"""
-
-import os
-from pathlib import Path
-
-from setuptools import setup
-from setuptools.command.build_py import build_py
-from setuptools.command.develop import develop
-from setuptools.command.egg_info import egg_info
-from setuptools.errors import SetupError
-
-PROTO_SOURCE = "sglang/srt/grpc/sglang_scheduler.proto"
-
-
-def compile_proto():
-    """Compile the protobuf file to Python using grpc_tools.protoc."""
-    proto_path = Path(__file__).parent / PROTO_SOURCE
-
-    if not proto_path.exists():
-        print(f"Warning: Proto file not found at {proto_path}, skipping generation")
-        return
-
-    print(f"Generating gRPC files from {PROTO_SOURCE}")
-
-    output_dir = proto_path.parent
-    proto_dir = proto_path.parent
-
-    # Import grpc_tools.protoc directly instead of running as subprocess.
-    # This ensures we use the grpcio-tools installed in the build environment,
-    # since sys.executable may point to the main Python interpreter in
-    # pip's isolated build environments.
-    try:
-        import grpc_tools
-        from grpc_tools import protoc
-    except ImportError as e:
-        raise SetupError(
-            f"Failed to import grpc_tools: {e}. "
-            "Ensure grpcio-tools is listed in build-system.requires in pyproject.toml"
-        )
-
-    # Get the path to well-known proto files bundled with grpcio-tools
-    # (e.g., google/protobuf/timestamp.proto, google/protobuf/struct.proto)
-    grpc_tools_proto_path = Path(grpc_tools.__file__).parent / "_proto"
-
-    # Build the protoc arguments (protoc.main expects argv-style list)
-    args = [
-        "protoc",  # argv[0] is the program name
-        f"-I{proto_dir}",
-        f"-I{grpc_tools_proto_path}",  # Include path for well-known protos
-        f"--python_out={output_dir}",
-        f"--grpc_python_out={output_dir}",
-        f"--pyi_out={output_dir}",
-        str(proto_dir / proto_path.name),
-    ]
-
-    print(f"Running protoc with args: {args[1:]}")
-
-    # Save and restore cwd since protoc may change it
-    original_cwd = os.getcwd()
-    try:
-        result = protoc.main(args)
-        if result != 0:
-            raise SetupError(f"protoc failed with exit code {result}")
-    finally:
-        os.chdir(original_cwd)
-
-    # Fix imports in generated grpc file (change absolute to relative imports)
-    _fix_imports(output_dir, proto_path.stem)
-
-    print(f"Successfully generated gRPC files in {output_dir}")
-
-
-def _fix_imports(output_dir: Path, proto_stem: str):
-    """Fix imports in generated files to use relative imports."""
-    grpc_file = output_dir / f"{proto_stem}_pb2_grpc.py"
-
-    if grpc_file.exists():
-        content = grpc_file.read_text()
-        # Change absolute import to relative import
-        old_import = f"import {proto_stem}_pb2"
-        new_import = f"from . import {proto_stem}_pb2"
-
-        if old_import in content:
-            content = content.replace(old_import, new_import)
-            grpc_file.write_text(content)
-            print("Fixed imports in generated gRPC file")
-
-
-class BuildPyWithProto(build_py):
-    """Build Python modules, generating gRPC files from .proto sources first."""
-
-    def run(self):
-        compile_proto()
-        super().run()
-
-
-class DevelopWithProto(develop):
-    """Editable install with gRPC file generation."""
-
-    def run(self):
-        compile_proto()
-        super().run()
-
-
-class EggInfoWithProto(egg_info):
-    """Egg info generation with gRPC file generation."""
-
-    def run(self):
-        compile_proto()
-        super().run()
-
-
-setup(
-    cmdclass={
-        "build_py": BuildPyWithProto,
-        "develop": DevelopWithProto,
-        "egg_info": EggInfoWithProto,
-    },
-)
--- a/python/sglang/srt/entrypoints/grpc_server.py
+++ b/python/sglang/srt/entrypoints/grpc_server.py
@@ -21,11 +21,11 @@ from google.protobuf.struct_pb2 import Struct
 from google.protobuf.timestamp_pb2 import Timestamp
 from grpc_health.v1 import health_pb2_grpc
 from grpc_reflection.v1alpha import reflection
+from smg_grpc_proto import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc

 import sglang
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
-from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
 from sglang.srt.grpc.grpc_request_manager import GrpcRequestManager
 from sglang.srt.grpc.health_servicer import SGLangHealthServicer
 from sglang.srt.grpc.scheduler_launcher import launch_scheduler_process_only
--- a/python/sglang/srt/grpc/compile_proto.py
+++ b/python/sglang/srt/grpc/compile_proto.py
@@ -1,248 +0,0 @@
-#!/usr/bin/env python3
-"""
-Compile protobuf files for SGLang gRPC server.
-
-This script compiles .proto files to Python code using grpc_tools.protoc.
-It generates:
- *_pb2.py (protobuf message classes)
- *_pb2_grpc.py (gRPC service classes)
- *_pb2.pyi (type hints for mypy/IDEs)
-
-Usage:
-    python compile_proto.py [--check] [--proto-file PROTO_FILE]
-
-Options:
-    --check         Check if regeneration is needed (exit 1 if needed)
-    --proto-file    Specify proto file (default: sglang_scheduler.proto)
-
-### Install Dependencies
-pip install "grpcio==1.75.1" "grpcio-tools==1.75.1"
-
-Please make sure to use the same version of grpcio and grpcio-tools specified in pyproject.toml
-otherwise update the versions specified in pyproject.toml
-
-### Run Script
-cd python/sglang/srt/grpc
-python compile_proto.py
-"""
-
-
-import argparse
-import subprocess
-import sys
-from importlib.metadata import version
-from pathlib import Path
-
-GRPC_VERSION = "1.75.1"
-
-
-def get_file_mtime(path: Path) -> float:
-    """Get file modification time, return 0 if file doesn't exist."""
-    try:
-        return path.stat().st_mtime
-    except FileNotFoundError:
-        return 0.0
-
-
-def check_regeneration_needed(proto_file: Path, output_dir: Path) -> bool:
-    """Check if proto files are newer than generated files."""
-    proto_mtime = get_file_mtime(proto_file)
-
-    generated_files = [
-        output_dir / f"{proto_file.stem}_pb2.py",
-        output_dir / f"{proto_file.stem}_pb2_grpc.py",
-        output_dir / f"{proto_file.stem}_pb2.pyi",
-    ]
-
-    for gen_file in generated_files:
-        if get_file_mtime(gen_file) < proto_mtime:
-            return True
-
-    return False
-
-
-def compile_proto(proto_file: Path, output_dir: Path, verbose: bool = True) -> bool:
-    """Compile the protobuf file to Python."""
-
-    if not proto_file.exists():
-        print(f"Error: Proto file not found: {proto_file}")
-        return False
-
-    if verbose:
-        print(f"Found proto file: {proto_file}")
-
-    # Check if grpc_tools is available
-    try:
-        import grpc_tools.protoc  # noqa: F401
-    except ImportError:
-        print("Error: grpcio-tools not installed")
-        print(
-            f'Install with: pip install "grpcio-tools=={GRPC_VERSION}" "grpcio=={GRPC_VERSION}"'
-        )
-        return False
-
-    grpc_tools_version = version("grpcio-tools")
-    grpc_version = version("grpcio")
-    if grpc_tools_version != GRPC_VERSION or grpc_version != GRPC_VERSION:
-        raise RuntimeError(
-            f"Error: grpcio-tools version {grpc_tools_version} and grpcio version {grpc_version} detected, but {GRPC_VERSION} is required."
-        )
-
-    # Compile command
-    cmd = [
-        sys.executable,
-        "-m",
-        "grpc_tools.protoc",
-        f"-I{proto_file.parent}",
-        f"--python_out={output_dir}",
-        f"--grpc_python_out={output_dir}",
-        f"--pyi_out={output_dir}",  # Generate type stubs
-        str(proto_file.name),
-    ]
-
-    if verbose:
-        print(f"Running: {' '.join(cmd)}")
-
-    # Run protoc
-    result = subprocess.run(cmd, capture_output=True, text=True, cwd=proto_file.parent)
-
-    if result.returncode != 0:
-        print(f"Error compiling proto:")
-        print(result.stderr)
-        if result.stdout:
-            print(result.stdout)
-        return False
-
-    # Verify generated files exist
-    generated_files = [
-        f"{proto_file.stem}_pb2.py",
-        f"{proto_file.stem}_pb2_grpc.py",
-        f"{proto_file.stem}_pb2.pyi",
-    ]
-
-    missing_files = []
-    for gen_file in generated_files:
-        if not (output_dir / gen_file).exists():
-            missing_files.append(gen_file)
-
-    if missing_files:
-        print(f"Error: Expected generated files not found: {missing_files}")
-        return False
-
-    if verbose:
-        print("Successfully compiled protobuf files:")
-        for gen_file in generated_files:
-            print(f"  - {output_dir}/{gen_file}")
-
-    # Fix imports in generated files
-    fix_imports(output_dir, proto_file.stem, verbose)
-
-    return True
-
-
-def fix_imports(output_dir: Path, proto_stem: str, verbose: bool = True) -> None:
-    """Fix imports in generated files to use relative imports."""
-    grpc_file = output_dir / f"{proto_stem}_pb2_grpc.py"
-
-    if grpc_file.exists():
-        content = grpc_file.read_text()
-        # Change absolute import to relative import
-        old_import = f"import {proto_stem}_pb2"
-        new_import = f"from . import {proto_stem}_pb2"
-
-        if old_import in content:
-            content = content.replace(old_import, new_import)
-            grpc_file.write_text(content)
-            if verbose:
-                print("Fixed imports in generated files")
-
-
-def add_generation_header(output_dir: Path, proto_stem: str) -> None:
-    """Add header to generated files indicating they are auto-generated."""
-    header = """# This file is auto-generated. Do not edit manually.
-# Regenerate with: python compile_proto.py
-
-"""
-
-    files_to_update = [f"{proto_stem}_pb2.py", f"{proto_stem}_pb2_grpc.py"]
-
-    for filename in files_to_update:
-        file_path = output_dir / filename
-        if file_path.exists():
-            content = file_path.read_text()
-            if not content.startswith("# This file is auto-generated"):
-                file_path.write_text(header + content)
-
-
-def main():
-    """Main entry point."""
-    parser = argparse.ArgumentParser(
-        description="Compile protobuf files for SGLang gRPC server",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog=__doc__,
-    )
-
-    parser.add_argument(
-        "--check",
-        action="store_true",
-        help="Check if regeneration is needed (exit 1 if needed)",
-    )
-
-    parser.add_argument(
-        "--proto-file",
-        type=str,
-        default="sglang_scheduler.proto",
-        help="Proto file to compile (default: sglang_scheduler.proto)",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-        default=True,
-        help="Verbose output (default: True)",
-    )
-
-    parser.add_argument(
-        "-q", "--quiet", action="store_true", help="Quiet mode (overrides verbose)"
-    )
-
-    args = parser.parse_args()
-
-    # Handle verbosity
-    verbose = args.verbose and not args.quiet
-
-    # Get paths
-    script_dir = Path(__file__).parent
-    proto_file = script_dir / args.proto_file
-    output_dir = script_dir
-
-    # Check mode
-    if args.check:
-        if check_regeneration_needed(proto_file, output_dir):
-            if verbose:
-                print("Proto files need regeneration")
-            sys.exit(1)
-        else:
-            if verbose:
-                print("Generated files are up to date")
-            sys.exit(0)
-
-    # Compile mode
-    success = compile_proto(proto_file, output_dir, verbose)
-
-    if success:
-        # Add generation headers
-        add_generation_header(output_dir, proto_file.stem)
-
-        if verbose:
-            print("\n✅ Protobuf compilation successful!")
-            print("Generated files are ready for use")
-    else:
-        if verbose:
-            print("\n❌ Protobuf compilation failed!")
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
--- a/python/sglang/srt/grpc/sglang_scheduler.proto
+++ b/python/sglang/srt/grpc/sglang_scheduler.proto
@@ -1,566 +0,0 @@
-syntax = "proto3";
-
-package sglang.grpc.scheduler;
-
-import "google/protobuf/timestamp.proto";
-import "google/protobuf/struct.proto";
-
-// Service definition for SGLang scheduler communication
-// This protocol bridges the Rust router and Python scheduler
-service SglangScheduler {
-  // Submit a generation request (supports streaming)
-  rpc Generate(GenerateRequest) returns (stream GenerateResponse);
-
-  // Submit an embedding request
-  rpc Embed(EmbedRequest) returns (EmbedResponse);
-
-  // Health check and metrics
-  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);
-
-  // Abort a running request
-  rpc Abort(AbortRequest) returns (AbortResponse);
-
-  // Get model information
-  rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);
-
-  // Get server information
-  rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
-
-  // Get comprehensive load metrics
-  rpc GetLoads(GetLoadsRequest) returns (GetLoadsResponse);
-
-}
-
-// =====================
-// Common Types
-// =====================
-
-// Sampling parameters matching SGLang's SamplingParams
-//
-// IMPORTANT: Do not use SamplingParams::default() directly!
-// The proto3 defaults (0 for numeric fields) do NOT match the semantic defaults
-// (temperature=1.0, top_p=1.0, top_k=-1, etc.). Always construct with explicit values
-// or use the conversion functions in sglang_scheduler.rs / grpc_server.py.
-message SamplingParams {
-  float temperature = 1;
-  float top_p = 2;
-  int32 top_k = 3;
-  float min_p = 4;
-  float frequency_penalty = 5;
-  float presence_penalty = 6;
-  float repetition_penalty = 7;
-
-  optional int32 max_new_tokens = 8;
-  repeated string stop = 9;
-  repeated uint32 stop_token_ids = 10;
-  bool skip_special_tokens = 11;
-  bool spaces_between_special_tokens = 12;
-
-  // Structured generation
-  oneof constraint {
-    string regex = 13;
-    string json_schema = 14;
-    string ebnf_grammar = 15;
-    string structural_tag = 16;
-  }
-
-  // Speculative decoding
-  int32 n = 17;  // Number of samples
-
-  // Additional parameters
-  int32 min_new_tokens = 18;
-  bool ignore_eos = 19;
-  bool no_stop_trim = 20;
-  optional int32 stream_interval = 21;
-  map<string, float> logit_bias = 22;
-
-  // Custom parameters for extensibility
-  google.protobuf.Struct custom_params = 23;
-}
-
-
-// Disaggregated serving parameters
-message DisaggregatedParams {
-  string bootstrap_host = 1;
-  int32 bootstrap_port = 2;
-  int32 bootstrap_room = 3;
-}
-
-// =====================
-// Generate Request
-// =====================
-
-message GenerateRequest {
-  string request_id = 1;
-
-  // Input must be tokenized (no raw text)
-  TokenizedInput tokenized = 2;
-
-  // Multimodal inputs
-  MultimodalInputs mm_inputs = 3;
-
-  // Generation parameters
-  SamplingParams sampling_params = 4;
-
-  // Return options
-  bool return_logprob = 5;
-  int32 logprob_start_len = 6;
-  int32 top_logprobs_num = 7;
-  repeated uint32 token_ids_logprob = 8;
-  bool return_hidden_states = 9;
-
-  // For disaggregated serving
-  DisaggregatedParams disaggregated_params = 10;
-
-  // Custom logit processor (serialized)
-  string custom_logit_processor = 11;
-
-  // Request metadata
-  google.protobuf.Timestamp timestamp = 12;
-  bool log_metrics = 13;
-
-  // Input embeddings (alternative to text/tokens)
-  repeated float input_embeds = 14;
-
-  // LoRA adapter ID (if pre-loaded)
-  string lora_id = 15;
-
-  // Data parallel routing
-  int32 data_parallel_rank = 16;
-
-  // Whether client wants streaming response
-  bool stream = 17;
-}
-
-message TokenizedInput {
-  string original_text = 1;  // For reference
-  repeated uint32 input_ids = 2;
-}
-
-message MultimodalInputs {
-  // Simplified multimodal handling - actual data processed by tokenizer
-  repeated string image_urls = 1;
-  repeated string video_urls = 2;
-  repeated string audio_urls = 3;
-
-  // Pre-processed multimodal features (if available)
-  google.protobuf.Struct processed_features = 4;
-
-  // Raw data for direct processing
-  repeated bytes image_data = 5;
-  repeated bytes video_data = 6;
-  repeated bytes audio_data = 7;
-
-  // Modality metadata
-  repeated string modalities = 8;
-}
-
-// =====================
-// Generate Response
-// =====================
-
-message GenerateResponse {
-  string request_id = 1;
-
-  // Response type
-  oneof response {
-    GenerateStreamChunk chunk = 2;
-    GenerateComplete complete = 3;
-    GenerateError error = 4;
-  }
-}
-
-message GenerateStreamChunk {
-  // Generated tokens (incremental chunk)
-  repeated uint32 token_ids = 1;
-
-  // Cumulative counts
-  int32 prompt_tokens = 2;
-  int32 completion_tokens = 3;
-  int32 cached_tokens = 4;
-
-  // Output logprobs (if requested) - incremental for streaming
-  OutputLogProbs output_logprobs = 5;
-
-  // Hidden states (if requested)
-  repeated float hidden_states = 6;
-
-  // Input logprobs (if requested) - only in first chunk
-  InputLogProbs input_logprobs = 7;
-
-  // Index for ordering when n>1 (for parallel request multiplexing)
-  uint32 index = 8;
-}
-
-message GenerateComplete {
-  // Final output
-  repeated uint32 output_ids = 1;
-
-  // Finish reason as OpenAI-compatible string ("stop", "length", "abort")
-  string finish_reason = 2;
-
-  // Token usage counts
-  int32 prompt_tokens = 3;
-  int32 completion_tokens = 4;
-  int32 cached_tokens = 5;
-
-  // Output logprobs if requested (cumulative)
-  OutputLogProbs output_logprobs = 6;
-
-  // All hidden states if requested
-  repeated HiddenStates all_hidden_states = 7;
-
-  // Matched stop information (for stop sequences)
-  oneof matched_stop {
-    uint32 matched_token_id = 8;
-    string matched_stop_str = 9;
-  }
-
-  // Input logprobs if requested (for prompt tokens)
-  InputLogProbs input_logprobs = 10;
-
-  // Index for ordering when n>1 (for parallel request multiplexing)
-  uint32 index = 11;
-}
-
-message GenerateError {
-  string message = 1;
-  string http_status_code = 2;
-  string details = 3;
-}
-
-// Output logprobs - all values are present (no None)
-message OutputLogProbs {
-  repeated float token_logprobs = 1;
-  repeated int32 token_ids = 2;
-
-  // Top logprobs at each position
-  repeated TopLogProbs top_logprobs = 3;
-}
-
-// Input logprobs - first token has no logprob (None)
-message InputLogProbs {
-  repeated InputTokenLogProb token_logprobs = 1;
-  repeated int32 token_ids = 2;
-
-  // Top logprobs at each position
-  repeated TopLogProbs top_logprobs = 3;
-}
-
-// Wrapper to represent optional logprob (first input token has no logprob)
-message InputTokenLogProb {
-  optional float value = 1;
-}
-
-message TopLogProbs {
-  repeated float values = 1;
-  repeated int32 token_ids = 2;
-}
-
-message HiddenStates {
-  repeated float values = 1;
-  int32 layer = 2;
-  int32 position = 3;
-}
-
-// =====================
-// Embedding Request
-// =====================
-
-message EmbedRequest {
-  string request_id = 1;
-
-  // Input must be tokenized (no raw text)
-  TokenizedInput tokenized = 2;
-
-  // Multimodal inputs
-  MultimodalInputs mm_inputs = 4;
-
-  // Dummy sampling params for compatibility
-  // EmbedRequest doesn't use sampling_params
-  SamplingParams sampling_params = 5;
-
-  bool log_metrics = 6;
-
-  // Token type IDs for models that require them
-  repeated int32 token_type_ids = 7;
-
-  // Data parallel routing
-  int32 data_parallel_rank = 8;
-
-  // For cross-encoder requests
-  bool is_cross_encoder = 9;
-  repeated string texts = 10;  // For cross-encoder batch
-}
-
-message EmbedResponse {
-  string request_id = 1;
-
-  oneof response {
-    EmbedComplete complete = 2;
-    EmbedError error = 3;
-  }
-}
-
-message EmbedComplete {
-  repeated float embedding = 1;
-  int32 prompt_tokens = 2;
-  int32 cached_tokens = 3;
-
-  // Additional metadata
-  int32 embedding_dim = 4;
-
-  // For batch embeddings
-  repeated Embedding batch_embeddings = 5;
-}
-
-message Embedding {
-  repeated float values = 1;
-  int32 index = 2;
-}
-
-message EmbedError {
-  string message = 1;
-  string code = 2;
-  string details = 3;
-}
-
-// =====================
-// Management Operations
-// =====================
-
-message HealthCheckRequest {}
-
-message HealthCheckResponse {
-  bool healthy = 1;
-  string message = 2;
-}
-
-message AbortRequest {
-  string request_id = 1;
-  string reason = 2;
-}
-
-message AbortResponse {
-  bool success = 1;
-  string message = 2;
-}
-
-
-// =====================
-// Additional Operations (Future)
-// =====================
-
-// Load LoRA adapter
-message LoadLoRARequest {
-  string adapter_id = 1;
-  string adapter_path = 2;
-  int32 rank = 3;
-}
-
-message LoadLoRAResponse {
-  bool success = 1;
-  string adapter_id = 2;
-  string message = 3;
-}
-
-// Unload LoRA adapter
-message UnloadLoRARequest {
-  string adapter_id = 1;
-}
-
-message UnloadLoRAResponse {
-  bool success = 1;
-  string message = 2;
-}
-
-// Update weights
-message UpdateWeightsRequest {
-  oneof source {
-    string disk_path = 1;
-    bytes tensor_data = 2;
-    string remote_url = 3;
-  }
-  string weight_name = 4;
-}
-
-message UpdateWeightsResponse {
-  bool success = 1;
-  string message = 2;
-}
-
-// Get internal state for debugging
-message GetInternalStateRequest {
-  repeated string state_keys = 1;
-}
-
-message GetInternalStateResponse {
-  google.protobuf.Struct state = 1;
-}
-
-// Set internal state for testing
-message SetInternalStateRequest {
-  google.protobuf.Struct state = 1;
-}
-
-message SetInternalStateResponse {
-  bool success = 1;
-  string message = 2;
-}
-
-// =====================
-// Model and Server Info
-// =====================
-
-// Get model information
-message GetModelInfoRequest {}
-
-message GetModelInfoResponse {
-  string model_path = 1;
-  string tokenizer_path = 2;
-  bool is_generation = 3;
-  string preferred_sampling_params = 4;  // JSON string or empty
-  string weight_version = 5;
-  string served_model_name = 6;
-  int32 max_context_length = 7;
-  int32 vocab_size = 8;
-  bool supports_vision = 9;
-  string model_type = 10;
-  repeated int32 eos_token_ids = 11;
-  int32 pad_token_id = 12;
-  int32 bos_token_id = 13;
-  int32 max_req_input_len = 14;
-  repeated string architectures = 15;
-
-  // Classification model support (from HuggingFace config.json)
-  // id2label maps class indices to label names, e.g., {"0": "negative", "1": "positive"}
-  string id2label_json = 16;
-  // Number of classification labels (0 if not a classifier)
-  int32 num_labels = 17;
-}
-
-// Get server information
-message GetServerInfoRequest {}
-
-message GetServerInfoResponse {
-  // Server configuration (as structured data)
-  google.protobuf.Struct server_args = 1;
-
-  // Scheduler metrics (from scheduler initialization)
-  google.protobuf.Struct scheduler_info = 2;
-
-  // Runtime state
-  int32 active_requests = 3;
-  bool is_paused = 4;
-  double last_receive_timestamp = 5;
-  double uptime_seconds = 6;
-
-  // Version info
-  string sglang_version = 7;
-
-  // Server metadata
-  string server_type = 8;  // "grpc"
-  google.protobuf.Timestamp start_time = 9;
-
-  // Note: internal_states not provided in gRPC mode
-  // Scheduler-side metrics (memory usage, throughput) require
-  // bidirectional communicator infrastructure not available in gRPC.
-  // Use HTTP /get_server_info if scheduler internal state is needed.
-}
-
-// =====================
-// Load Metrics (v1/loads)
-// =====================
-
-message GetLoadsRequest {
-  // Optional: filter to specific DP rank
-  optional int32 dp_rank = 1;
-
-  // Sections to include: core, memory, spec, lora, disagg, queues, all
-  repeated string include = 2;
-}
-
-message GetLoadsResponse {
-  // ISO 8601 timestamp
-  string timestamp = 1;
-
-  // SGLang version
-  string version = 2;
-
-  // Number of DP ranks
-  int32 dp_rank_count = 3;
-
-  // Per-DP-rank load metrics
-  repeated SchedulerLoad loads = 4;
-
-  // Aggregate metrics across all DP ranks
-  AggregateMetrics aggregate = 5;
-}
-
-message SchedulerLoad {
-  int32 dp_rank = 1;
-
-  // Core metrics (always included)
-  int32 num_running_reqs = 2;
-  int32 num_waiting_reqs = 3;
-  int32 num_total_reqs = 4;
-  int32 num_used_tokens = 5;
-  int32 max_total_num_tokens = 6;
-  double token_usage = 7;
-  double gen_throughput = 8;
-  double cache_hit_rate = 9;
-  double utilization = 10;
-  int32 max_running_requests = 11;
-
-  // Optional sections
-  optional MemoryMetrics memory = 12;
-  optional SpeculativeMetrics speculative = 13;
-  optional LoRAMetrics lora = 14;
-  optional DisaggregationMetrics disaggregation = 15;
-  optional QueueMetrics queues = 16;
-}
-
-message MemoryMetrics {
-  double weight_gb = 1;
-  double kv_cache_gb = 2;
-  double graph_gb = 3;
-  int32 token_capacity = 4;
-}
-
-message SpeculativeMetrics {
-  double accept_length = 1;
-  double accept_rate = 2;
-}
-
-message LoRAMetrics {
-  int32 slots_used = 1;
-  int32 slots_total = 2;
-  double utilization = 3;
-}
-
-message DisaggregationMetrics {
-  string mode = 1;  // "prefill", "decode", or "null"
-  int32 prefill_prealloc_queue_reqs = 2;
-  int32 prefill_inflight_queue_reqs = 3;
-  int32 decode_prealloc_queue_reqs = 4;
-  int32 decode_transfer_queue_reqs = 5;
-  int32 decode_retracted_queue_reqs = 6;
-  double kv_transfer_speed_gb_s = 7;
-  double kv_transfer_latency_ms = 8;
-}
-
-message QueueMetrics {
-  int32 waiting = 1;
-  int32 grammar = 2;
-  int32 paused = 3;
-  int32 retracted = 4;
-}
-
-message AggregateMetrics {
-  int32 total_running_reqs = 1;
-  int32 total_waiting_reqs = 2;
-  int32 total_reqs = 3;
-  double avg_token_usage = 4;
-  double avg_throughput = 5;
-  double avg_utilization = 6;
-}