mirror of
https://github.com/kvcache-ai/sglang.git
synced 2026-06-30 03:37:51 +00:00
1239 lines
42 KiB
JSON
1239 lines
42 KiB
JSON
{
|
|
"$schema": "https://mintlify.com/docs.json",
|
|
"theme": "aspen",
|
|
"name": "SGLang Documentation",
|
|
"seo": {
|
|
"metatags": {
|
|
"google-site-verification": "bX3ofyYQhraIpAYf4DpyZQXZO_G4xLR_RqeBAKnJA7g"
|
|
}
|
|
},
|
|
"redirects": [
|
|
{
|
|
"source": "/docs/references/learn_more",
|
|
"destination": "/"
|
|
},
|
|
{
|
|
"source": "/cookbook",
|
|
"destination": "/cookbook/intro"
|
|
},
|
|
{
|
|
"source": "/whl",
|
|
"destination": "https://sgl-project.github.io/whl/",
|
|
"permanent": false
|
|
},
|
|
{
|
|
"source": "/whl/:path*",
|
|
"destination": "https://sgl-project.github.io/whl/:path*",
|
|
"permanent": false
|
|
},
|
|
{
|
|
"source": "/sglang-omni",
|
|
"destination": "https://sgl-project.github.io/sglang-omni/",
|
|
"permanent": false
|
|
},
|
|
{
|
|
"source": "/sglang-omni/:path*",
|
|
"destination": "https://sgl-project.github.io/sglang-omni/:path*",
|
|
"permanent": false
|
|
},
|
|
{
|
|
"source": "/SpecForge",
|
|
"destination": "https://sgl-project.github.io/SpecForge/",
|
|
"permanent": false
|
|
},
|
|
{
|
|
"source": "/SpecForge/:path*",
|
|
"destination": "https://sgl-project.github.io/SpecForge/:path*",
|
|
"permanent": false
|
|
},
|
|
{
|
|
"source": "/specforge",
|
|
"destination": "https://sgl-project.github.io/SpecForge/",
|
|
"permanent": false
|
|
},
|
|
{
|
|
"source": "/specforge/:path*",
|
|
"destination": "https://sgl-project.github.io/SpecForge/:path*",
|
|
"permanent": false
|
|
},
|
|
{
|
|
"source": "/index.html",
|
|
"destination": "/"
|
|
},
|
|
{
|
|
"source": "/advanced_features/adaptive_speculative_decoding.html",
|
|
"destination": "/docs/advanced_features/adaptive_speculative_decoding"
|
|
},
|
|
{
|
|
"source": "/advanced_features/attention_backend.html",
|
|
"destination": "/docs/advanced_features/attention_backend"
|
|
},
|
|
{
|
|
"source": "/advanced_features/breakable_cuda_graph.html",
|
|
"destination": "/docs/advanced_features/breakable_cuda_graph"
|
|
},
|
|
{
|
|
"source": "/advanced_features/checkpoint_engine.html",
|
|
"destination": "/docs/advanced_features/checkpoint_engine"
|
|
},
|
|
{
|
|
"source": "/advanced_features/cuda_graph_for_multi_modal_encoder.html",
|
|
"destination": "/docs/advanced_features/cuda_graph_for_multi_modal_encoder"
|
|
},
|
|
{
|
|
"source": "/advanced_features/deterministic_inference.html",
|
|
"destination": "/docs/advanced_features/deterministic_inference"
|
|
},
|
|
{
|
|
"source": "/advanced_features/dp_dpa_smg_guide.html",
|
|
"destination": "/docs/advanced_features/dp_dpa_smg_guide"
|
|
},
|
|
{
|
|
"source": "/advanced_features/dp_for_multi_modal_encoder.html",
|
|
"destination": "/docs/advanced_features/dp_for_multi_modal_encoder"
|
|
},
|
|
{
|
|
"source": "/advanced_features/epd_disaggregation.html",
|
|
"destination": "/docs/advanced_features/epd_disaggregation"
|
|
},
|
|
{
|
|
"source": "/advanced_features/expert_parallelism.html",
|
|
"destination": "/docs/advanced_features/expert_parallelism"
|
|
},
|
|
{
|
|
"source": "/advanced_features/forward_hooks.html",
|
|
"destination": "/docs/advanced_features/forward_hooks"
|
|
},
|
|
{
|
|
"source": "/advanced_features/hicache.html",
|
|
"destination": "/docs/advanced_features/hicache"
|
|
},
|
|
{
|
|
"source": "/advanced_features/hicache_best_practices.html",
|
|
"destination": "/docs/advanced_features/hicache_best_practices"
|
|
},
|
|
{
|
|
"source": "/advanced_features/hicache_design.html",
|
|
"destination": "/docs/advanced_features/hicache_design"
|
|
},
|
|
{
|
|
"source": "/advanced_features/hicache_storage_runtime_attach_detach.html",
|
|
"destination": "/docs/advanced_features/hicache_storage_runtime_attach_detach"
|
|
},
|
|
{
|
|
"source": "/advanced_features/hisparse_guide.html",
|
|
"destination": "/docs/advanced_features/hisparse_guide"
|
|
},
|
|
{
|
|
"source": "/advanced_features/hyperparameter_tuning.html",
|
|
"destination": "/docs/advanced_features/hyperparameter_tuning"
|
|
},
|
|
{
|
|
"source": "/advanced_features/lora.html",
|
|
"destination": "/docs/advanced_features/lora"
|
|
},
|
|
{
|
|
"source": "/advanced_features/object_storage.html",
|
|
"destination": "/docs/advanced_features/object_storage"
|
|
},
|
|
{
|
|
"source": "/advanced_features/observability.html",
|
|
"destination": "/docs/advanced_features/observability"
|
|
},
|
|
{
|
|
"source": "/advanced_features/pd_disaggregation.html",
|
|
"destination": "/docs/advanced_features/pd_disaggregation"
|
|
},
|
|
{
|
|
"source": "/advanced_features/piecewise_cuda_graph.html",
|
|
"destination": "/docs/advanced_features/piecewise_cuda_graph"
|
|
},
|
|
{
|
|
"source": "/advanced_features/pipeline_parallelism.html",
|
|
"destination": "/docs/advanced_features/pipeline_parallelism"
|
|
},
|
|
{
|
|
"source": "/advanced_features/quantization.html",
|
|
"destination": "/docs/advanced_features/quantization"
|
|
},
|
|
{
|
|
"source": "/advanced_features/quantized_kv_cache.html",
|
|
"destination": "/docs/advanced_features/quantized_kv_cache"
|
|
},
|
|
{
|
|
"source": "/advanced_features/rfork.html",
|
|
"destination": "/docs/advanced_features/rfork"
|
|
},
|
|
{
|
|
"source": "/advanced_features/separate_reasoning.html",
|
|
"destination": "/docs/advanced_features/separate_reasoning"
|
|
},
|
|
{
|
|
"source": "/advanced_features/server_arguments.html",
|
|
"destination": "/docs/advanced_features/server_arguments"
|
|
},
|
|
{
|
|
"source": "/advanced_features/sgl_model_gateway.html",
|
|
"destination": "/docs/advanced_features/sgl_model_gateway"
|
|
},
|
|
{
|
|
"source": "/advanced_features/sglang_for_rl.html",
|
|
"destination": "/docs/advanced_features/sglang_for_rl"
|
|
},
|
|
{
|
|
"source": "/advanced_features/speculative_decoding.html",
|
|
"destination": "/docs/advanced_features/speculative_decoding"
|
|
},
|
|
{
|
|
"source": "/advanced_features/structured_outputs.html",
|
|
"destination": "/docs/advanced_features/structured_outputs"
|
|
},
|
|
{
|
|
"source": "/advanced_features/structured_outputs_for_reasoning_models.html",
|
|
"destination": "/docs/advanced_features/structured_outputs_for_reasoning_models"
|
|
},
|
|
{
|
|
"source": "/advanced_features/tool_parser.html",
|
|
"destination": "/docs/advanced_features/tool_parser"
|
|
},
|
|
{
|
|
"source": "/advanced_features/vlm_query.html",
|
|
"destination": "/docs/advanced_features/vlm_query"
|
|
},
|
|
{
|
|
"source": "/basic_usage/deepseek_ocr.html",
|
|
"destination": "/docs/basic_usage/deepseek_ocr"
|
|
},
|
|
{
|
|
"source": "/basic_usage/deepseek_v3.html",
|
|
"destination": "/docs/basic_usage/deepseek_v3"
|
|
},
|
|
{
|
|
"source": "/basic_usage/deepseek_v32.html",
|
|
"destination": "/docs/basic_usage/deepseek_v32"
|
|
},
|
|
{
|
|
"source": "/basic_usage/glm45.html",
|
|
"destination": "/docs/basic_usage/glm45"
|
|
},
|
|
{
|
|
"source": "/basic_usage/glmv.html",
|
|
"destination": "/docs/basic_usage/glmv"
|
|
},
|
|
{
|
|
"source": "/basic_usage/gpt_oss.html",
|
|
"destination": "/docs/basic_usage/gpt_oss"
|
|
},
|
|
{
|
|
"source": "/basic_usage/llama4.html",
|
|
"destination": "/docs/basic_usage/llama4"
|
|
},
|
|
{
|
|
"source": "/basic_usage/minimax_m2.html",
|
|
"destination": "/docs/basic_usage/minimax_m2"
|
|
},
|
|
{
|
|
"source": "/basic_usage/native_api.html",
|
|
"destination": "/docs/basic_usage/native_api"
|
|
},
|
|
{
|
|
"source": "/basic_usage/offline_engine_api.html",
|
|
"destination": "/docs/basic_usage/offline_engine_api"
|
|
},
|
|
{
|
|
"source": "/basic_usage/ollama_api.html",
|
|
"destination": "/docs/basic_usage/ollama_api"
|
|
},
|
|
{
|
|
"source": "/basic_usage/openai_api.html",
|
|
"destination": "/docs/basic_usage/openai_api"
|
|
},
|
|
{
|
|
"source": "/basic_usage/openai_api_completions.html",
|
|
"destination": "/docs/basic_usage/openai_api_completions"
|
|
},
|
|
{
|
|
"source": "/basic_usage/openai_api_embeddings.html",
|
|
"destination": "/docs/basic_usage/openai_api_embeddings"
|
|
},
|
|
{
|
|
"source": "/basic_usage/openai_api_vision.html",
|
|
"destination": "/docs/basic_usage/openai_api_vision"
|
|
},
|
|
{
|
|
"source": "/basic_usage/popular_model_usage.html",
|
|
"destination": "/docs/basic_usage/popular_model_usage"
|
|
},
|
|
{
|
|
"source": "/basic_usage/qwen3.html",
|
|
"destination": "/docs/basic_usage/qwen3"
|
|
},
|
|
{
|
|
"source": "/basic_usage/qwen3_5.html",
|
|
"destination": "/docs/basic_usage/qwen3_5"
|
|
},
|
|
{
|
|
"source": "/basic_usage/qwen3_vl.html",
|
|
"destination": "/docs/basic_usage/qwen3_vl"
|
|
},
|
|
{
|
|
"source": "/basic_usage/sampling_params.html",
|
|
"destination": "/docs/basic_usage/sampling_params"
|
|
},
|
|
{
|
|
"source": "/basic_usage/send_request.html",
|
|
"destination": "/docs/basic_usage/send_request"
|
|
},
|
|
{
|
|
"source": "/developer_guide/bench_serving.html",
|
|
"destination": "/docs/developer_guide/bench_serving"
|
|
},
|
|
{
|
|
"source": "/developer_guide/benchmark_and_profiling.html",
|
|
"destination": "/docs/developer_guide/benchmark_and_profiling"
|
|
},
|
|
{
|
|
"source": "/developer_guide/contribution_guide.html",
|
|
"destination": "/docs/developer_guide/contribution_guide"
|
|
},
|
|
{
|
|
"source": "/developer_guide/development_guide_using_docker.html",
|
|
"destination": "/docs/developer_guide/development_guide_using_docker"
|
|
},
|
|
{
|
|
"source": "/developer_guide/development_jit_kernel_guide.html",
|
|
"destination": "/docs/developer_guide/development_jit_kernel_guide"
|
|
},
|
|
{
|
|
"source": "/developer_guide/evaluating_new_models.html",
|
|
"destination": "/docs/developer_guide/evaluating_new_models"
|
|
},
|
|
{
|
|
"source": "/developer_guide/release_process.html",
|
|
"destination": "/docs/developer_guide/release_process"
|
|
},
|
|
{
|
|
"source": "/developer_guide/setup_github_runner.html",
|
|
"destination": "/docs/developer_guide/setup_github_runner"
|
|
},
|
|
{
|
|
"source": "/diffusion/api/cli.html",
|
|
"destination": "/docs/sglang-diffusion/api/cli"
|
|
},
|
|
{
|
|
"source": "/diffusion/api/openai_api.html",
|
|
"destination": "/docs/sglang-diffusion/api/openai_api"
|
|
},
|
|
{
|
|
"source": "/diffusion/api/post_processing.html",
|
|
"destination": "/docs/sglang-diffusion/api/post_processing"
|
|
},
|
|
{
|
|
"source": "/diffusion/ci_perf.html",
|
|
"destination": "/docs/sglang-diffusion/ci_perf"
|
|
},
|
|
{
|
|
"source": "/diffusion/compatibility_matrix.html",
|
|
"destination": "/docs/sglang-diffusion/compatibility_matrix"
|
|
},
|
|
{
|
|
"source": "/diffusion/contributing.html",
|
|
"destination": "/docs/sglang-diffusion/contributing"
|
|
},
|
|
{
|
|
"source": "/diffusion/development.html",
|
|
"destination": "/docs/sglang-diffusion/installation"
|
|
},
|
|
{
|
|
"source": "/diffusion/disaggregation.html",
|
|
"destination": "/docs/sglang-diffusion/disaggregation"
|
|
},
|
|
{
|
|
"source": "/diffusion/environment_variables.html",
|
|
"destination": "/docs/sglang-diffusion/environment_variables"
|
|
},
|
|
{
|
|
"source": "/diffusion/index.html",
|
|
"destination": "/docs/sglang-diffusion/index"
|
|
},
|
|
{
|
|
"source": "/diffusion/installation.html",
|
|
"destination": "/docs/sglang-diffusion/installation"
|
|
},
|
|
{
|
|
"source": "/diffusion/performance/attention_backends.html",
|
|
"destination": "/docs/sglang-diffusion/attention_backends"
|
|
},
|
|
{
|
|
"source": "/diffusion/performance/dynamic_batching.html",
|
|
"destination": "/docs/sglang-diffusion/dynamic_batching"
|
|
},
|
|
{
|
|
"source": "/diffusion/performance/cache/cache_dit.html",
|
|
"destination": "/docs/sglang-diffusion/cache_dit"
|
|
},
|
|
{
|
|
"source": "/diffusion/performance/cache/index.html",
|
|
"destination": "/docs/sglang-diffusion/caching-acceleration"
|
|
},
|
|
{
|
|
"source": "/diffusion/performance/cache/teacache.html",
|
|
"destination": "/docs/sglang-diffusion/teacache"
|
|
},
|
|
{
|
|
"source": "/diffusion/performance/index.html",
|
|
"destination": "/docs/sglang-diffusion/performance-optimization"
|
|
},
|
|
{
|
|
"source": "/diffusion/performance/profiling.html",
|
|
"destination": "/docs/sglang-diffusion/profiling"
|
|
},
|
|
{
|
|
"source": "/diffusion/performance/ring_sp_performance.html",
|
|
"destination": "/docs/sglang-diffusion/ring_sp_performance"
|
|
},
|
|
{
|
|
"source": "/diffusion/quantization.html",
|
|
"destination": "/docs/sglang-diffusion/quantization"
|
|
},
|
|
{
|
|
"source": "/diffusion/reference.html",
|
|
"destination": "/docs/sglang-diffusion/installation"
|
|
},
|
|
{
|
|
"source": "/diffusion/support_new_models.html",
|
|
"destination": "/docs/sglang-diffusion/support_new_models"
|
|
},
|
|
{
|
|
"source": "/diffusion/usage.html",
|
|
"destination": "/docs/sglang-diffusion/installation"
|
|
},
|
|
{
|
|
"source": "/get_started/install.html",
|
|
"destination": "/docs/get-started/install"
|
|
},
|
|
{
|
|
"source": "/platforms/amd_gpu.html",
|
|
"destination": "/docs/hardware-platforms/amd_gpu"
|
|
},
|
|
{
|
|
"source": "/platforms/apple_metal.html",
|
|
"destination": "/docs/hardware-platforms/apple_metal"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_contribution_guide.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_contribution_guide"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_best_practice.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_best_practice"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_deepseek_example.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_deepseek_example"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_environment_variables.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_environment_variables"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_glm5_examples.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_glm5_examples"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_quantization.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_quantization"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_quick_start.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_quick_start"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_qwen3_5_examples.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_qwen3_5_examples"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_qwen3_examples.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_qwen3_examples"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_support.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_quick_start"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_support_features.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_support_features"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/ascend_npu_support_models.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_support_models"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend/mindspore_backend.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/mindspore_backend"
|
|
},
|
|
{
|
|
"source": "/platforms/ascend_npu_ring_sp_performance.html",
|
|
"destination": "/docs/hardware-platforms/ascend-npus/ascend_npu_ring_sp_performance"
|
|
},
|
|
{
|
|
"source": "/platforms/cpu_server.html",
|
|
"destination": "/docs/hardware-platforms/cpu_server"
|
|
},
|
|
{
|
|
"source": "/platforms/mthreads_gpu.html",
|
|
"destination": "/docs/hardware-platforms/mthreads_gpu"
|
|
},
|
|
{
|
|
"source": "/platforms/nvidia_jetson.html",
|
|
"destination": "/docs/hardware-platforms/nvidia_jetson"
|
|
},
|
|
{
|
|
"source": "/platforms/plugin.html",
|
|
"destination": "/docs/hardware-platforms/plugin"
|
|
},
|
|
{
|
|
"source": "/platforms/tpu.html",
|
|
"destination": "/docs/hardware-platforms/tpu"
|
|
},
|
|
{
|
|
"source": "/platforms/xpu.html",
|
|
"destination": "/docs/hardware-platforms/xpu"
|
|
},
|
|
{
|
|
"source": "/references/custom_chat_template.html",
|
|
"destination": "/docs/references/custom_chat_template"
|
|
},
|
|
{
|
|
"source": "/references/environment_variables.html",
|
|
"destination": "/docs/references/environment_variables"
|
|
},
|
|
{
|
|
"source": "/references/faq.html",
|
|
"destination": "/docs/references/faq"
|
|
},
|
|
{
|
|
"source": "/references/frontend/choices_methods.html",
|
|
"destination": "/docs/references/frontend/choices_methods"
|
|
},
|
|
{
|
|
"source": "/references/frontend/frontend_index.html",
|
|
"destination": "/docs/references/frontend/frontend_index"
|
|
},
|
|
{
|
|
"source": "/references/frontend/frontend_tutorial.html",
|
|
"destination": "/docs/references/frontend/frontend_tutorial"
|
|
},
|
|
{
|
|
"source": "/references/learn_more.html",
|
|
"destination": "/"
|
|
},
|
|
{
|
|
"source": "/references/multi_node_deployment/deploy_on_k8s.html",
|
|
"destination": "/docs/references/multi_node_deployment/deploy_on_k8s"
|
|
},
|
|
{
|
|
"source": "/references/multi_node_deployment/lws_pd/lws_pd_deploy.html",
|
|
"destination": "/docs/references/multi_node_deployment/lws_pd/lws_pd_deploy"
|
|
},
|
|
{
|
|
"source": "/references/multi_node_deployment/multi_node.html",
|
|
"destination": "/docs/references/multi_node_deployment/multi_node"
|
|
},
|
|
{
|
|
"source": "/references/multi_node_deployment/multi_node_index.html",
|
|
"destination": "/docs/references/multi_node_deployment/multi_node_index"
|
|
},
|
|
{
|
|
"source": "/references/multi_node_deployment/rbg_pd/deepseekv32_pd.html",
|
|
"destination": "/docs/references/multi_node_deployment/rbg_pd/deepseekv32_pd"
|
|
},
|
|
{
|
|
"source": "/references/post_training_integration.html",
|
|
"destination": "/docs/references/post_training_integration"
|
|
},
|
|
{
|
|
"source": "/references/production_metrics.html",
|
|
"destination": "/docs/references/production_metrics"
|
|
},
|
|
{
|
|
"source": "/references/production_request_trace.html",
|
|
"destination": "/docs/references/production_request_trace"
|
|
},
|
|
{
|
|
"source": "/references/release_lookup.html",
|
|
"destination": "/docs/references/overview"
|
|
},
|
|
{
|
|
"source": "/references/torch_compile_cache.html",
|
|
"destination": "/docs/references/torch_compile_cache"
|
|
},
|
|
{
|
|
"source": "/supported_models/extending/index.html",
|
|
"destination": "/docs/supported-models"
|
|
},
|
|
{
|
|
"source": "/supported_models/extending/mindspore_models.html",
|
|
"destination": "/docs/supported-models/mindspore_models"
|
|
},
|
|
{
|
|
"source": "/supported_models/extending/modelscope.html",
|
|
"destination": "/docs/supported-models/modelscope"
|
|
},
|
|
{
|
|
"source": "/supported_models/extending/support_new_models.html",
|
|
"destination": "/docs/supported-models/support_new_models"
|
|
},
|
|
{
|
|
"source": "/supported_models/extending/transformers_fallback.html",
|
|
"destination": "/docs/supported-models/transformers_fallback"
|
|
},
|
|
{
|
|
"source": "/supported_models/index.html",
|
|
"destination": "/docs/supported-models"
|
|
},
|
|
{
|
|
"source": "/supported_models/retrieval_ranking/classify_models.html",
|
|
"destination": "/docs/supported-models/classify_models"
|
|
},
|
|
{
|
|
"source": "/supported_models/retrieval_ranking/embedding_models.html",
|
|
"destination": "/docs/supported-models/embedding_models"
|
|
},
|
|
{
|
|
"source": "/supported_models/retrieval_ranking/index.html",
|
|
"destination": "/docs/supported-models"
|
|
},
|
|
{
|
|
"source": "/supported_models/retrieval_ranking/rerank_models.html",
|
|
"destination": "/docs/supported-models/rerank_models"
|
|
},
|
|
{
|
|
"source": "/supported_models/specialized/index.html",
|
|
"destination": "/docs/supported-models"
|
|
},
|
|
{
|
|
"source": "/supported_models/specialized/reward_models.html",
|
|
"destination": "/docs/supported-models/reward_models"
|
|
},
|
|
{
|
|
"source": "/supported_models/text_generation/diffusion_language_models.html",
|
|
"destination": "/docs/supported-models/diffusion_language_models"
|
|
},
|
|
{
|
|
"source": "/supported_models/text_generation/generative_models.html",
|
|
"destination": "/docs/supported-models/generative_models"
|
|
},
|
|
{
|
|
"source": "/supported_models/text_generation/index.html",
|
|
"destination": "/docs/supported-models"
|
|
},
|
|
{
|
|
"source": "/supported_models/text_generation/multimodal_language_models.html",
|
|
"destination": "/docs/supported-models/multimodal_language_models"
|
|
},
|
|
{
|
|
"source": "/supported_models.html",
|
|
"destination": "/docs/supported-models"
|
|
},
|
|
{
|
|
"source": "/diffusion.html",
|
|
"destination": "/docs/sglang-diffusion/index"
|
|
}
|
|
],
|
|
"colors": {
|
|
"primary": "#d55816",
|
|
"light": "#d55816",
|
|
"dark": "#d55816"
|
|
},
|
|
"background": {
|
|
"decoration": "grid",
|
|
"color": {
|
|
"dark": "#1d1d1d",
|
|
"light": "#fffcfb"
|
|
}
|
|
},
|
|
"fonts": {
|
|
"heading": {
|
|
"family": "Inter",
|
|
"weight": 600
|
|
},
|
|
"body": {
|
|
"family": "Inter",
|
|
"weight": 400
|
|
}
|
|
},
|
|
"favicon": "/favicon.png",
|
|
"navigation": {
|
|
"tabs": [
|
|
{
|
|
"tab": "Get Started",
|
|
"groups": [
|
|
{
|
|
"group": "Get Started",
|
|
"icon": "play",
|
|
"pages": [
|
|
"index",
|
|
"docs/get-started/install",
|
|
"docs/get-started/quickstart",
|
|
"docs/basic_usage/send_request"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tab": "User Guide",
|
|
"groups": [
|
|
{
|
|
"group": "Basic Usage",
|
|
"icon": "book-open",
|
|
"pages": [
|
|
"docs/basic_usage/overview",
|
|
{
|
|
"group": "OpenAI-Compatible APIs",
|
|
"pages": [
|
|
"docs/basic_usage/openai_api",
|
|
"docs/basic_usage/openai_api_completions",
|
|
"docs/basic_usage/openai_api_vision",
|
|
"docs/basic_usage/openai_api_embeddings"
|
|
]
|
|
},
|
|
"docs/basic_usage/ollama_api",
|
|
"docs/basic_usage/offline_engine_api",
|
|
"docs/basic_usage/native_api",
|
|
"docs/basic_usage/sampling_params",
|
|
{
|
|
"group": "Popular Model Usage",
|
|
"pages": [
|
|
"docs/basic_usage/popular_model_usage",
|
|
"docs/basic_usage/deepseek_v3",
|
|
"docs/basic_usage/deepseek_v32",
|
|
"docs/basic_usage/deepseek_ocr",
|
|
"docs/basic_usage/glm45",
|
|
"docs/basic_usage/glmv",
|
|
"docs/basic_usage/gpt_oss",
|
|
"docs/basic_usage/kimi_k2_5",
|
|
"docs/basic_usage/minimax_m2",
|
|
"docs/basic_usage/qwen3",
|
|
"docs/basic_usage/qwen3_5",
|
|
"docs/basic_usage/qwen3_vl",
|
|
"docs/basic_usage/llama4"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"group": "Advanced Features",
|
|
"icon": "gears",
|
|
"pages": [
|
|
"docs/advanced_features/overview",
|
|
"docs/advanced_features/server_arguments",
|
|
"docs/advanced_features/object_storage",
|
|
"docs/advanced_features/hyperparameter_tuning",
|
|
"docs/advanced_features/attention_backend",
|
|
"docs/advanced_features/hisparse_guide",
|
|
"docs/advanced_features/speculative_decoding",
|
|
"docs/advanced_features/adaptive_speculative_decoding",
|
|
"docs/advanced_features/structured_outputs",
|
|
"docs/advanced_features/structured_outputs_for_reasoning_models",
|
|
"docs/advanced_features/tool_parser",
|
|
"docs/advanced_features/separate_reasoning",
|
|
"docs/advanced_features/quantization",
|
|
"docs/advanced_features/quantized_kv_cache",
|
|
"docs/advanced_features/dp_dpa_smg_guide",
|
|
"docs/advanced_features/expert_parallelism",
|
|
"docs/advanced_features/lora",
|
|
"docs/advanced_features/pd_disaggregation",
|
|
"docs/advanced_features/epd_disaggregation",
|
|
"docs/advanced_features/pipeline_parallelism",
|
|
{
|
|
"group": "Hierarchical KV Caching (HiCache)",
|
|
"pages": [
|
|
"docs/advanced_features/hicache",
|
|
"docs/advanced_features/hicache_best_practices",
|
|
"docs/advanced_features/hicache_design",
|
|
"docs/advanced_features/hicache_storage_runtime_attach_detach"
|
|
]
|
|
},
|
|
"docs/advanced_features/vlm_query",
|
|
"docs/advanced_features/dp_for_multi_modal_encoder",
|
|
"docs/advanced_features/cuda_graph_for_multi_modal_encoder",
|
|
"docs/advanced_features/breakable_cuda_graph",
|
|
"docs/advanced_features/piecewise_cuda_graph",
|
|
"docs/advanced_features/sgl_model_gateway",
|
|
"docs/advanced_features/deterministic_inference",
|
|
"docs/advanced_features/observability",
|
|
"docs/advanced_features/checkpoint_engine",
|
|
"docs/advanced_features/sglang_for_rl"
|
|
]
|
|
},
|
|
{
|
|
"group": "Supported Models",
|
|
"icon": "cubes",
|
|
"pages": [
|
|
"docs/supported-models",
|
|
{
|
|
"group": "Text Generation",
|
|
"pages": [
|
|
"docs/supported-models/generative_models",
|
|
"docs/supported-models/multimodal_language_models",
|
|
"docs/supported-models/diffusion_language_models"
|
|
]
|
|
},
|
|
{
|
|
"group": "Retrieval and Ranking",
|
|
"pages": [
|
|
"docs/supported-models/embedding_models",
|
|
"docs/supported-models/rerank_models",
|
|
"docs/supported-models/classify_models"
|
|
]
|
|
},
|
|
{
|
|
"group": "Specialized Models",
|
|
"pages": [
|
|
"docs/supported-models/reward_models"
|
|
]
|
|
},
|
|
{
|
|
"group": "Extending SGLang",
|
|
"pages": [
|
|
"docs/supported-models/support_new_models",
|
|
"docs/supported-models/transformers_fallback",
|
|
"docs/supported-models/modelscope",
|
|
"docs/supported-models/mindspore_models"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"group": "Developer Guide",
|
|
"icon": "code",
|
|
"pages": [
|
|
"docs/developer_guide/overview",
|
|
"docs/developer_guide/contribution_guide",
|
|
{
|
|
"group": "Development",
|
|
"pages": [
|
|
"docs/developer_guide/development_guide_using_docker",
|
|
"docs/developer_guide/development_jit_kernel_guide"
|
|
]
|
|
},
|
|
{
|
|
"group": "Benchmarking",
|
|
"pages": [
|
|
"docs/developer_guide/benchmark_and_profiling",
|
|
"docs/developer_guide/bench_serving"
|
|
]
|
|
},
|
|
"docs/developer_guide/evaluating_new_models",
|
|
"docs/developer_guide/msprobe_debugging_guide"
|
|
]
|
|
},
|
|
{
|
|
"group": "References",
|
|
"icon": "bookmark",
|
|
"pages": [
|
|
"docs/references/overview",
|
|
"docs/references/faq",
|
|
"docs/references/environment_variables",
|
|
"docs/references/production_metrics",
|
|
"docs/references/production_request_trace",
|
|
{
|
|
"group": "Multi-Node Deployment",
|
|
"pages": [
|
|
"docs/references/multi_node_deployment/multi_node_index",
|
|
"docs/references/multi_node_deployment/multi_node",
|
|
"docs/references/multi_node_deployment/deploy_on_k8s",
|
|
"docs/references/multi_node_deployment/lws_pd/lws_pd_deploy",
|
|
"docs/references/multi_node_deployment/rbg_pd/deepseekv32_pd"
|
|
]
|
|
},
|
|
"docs/references/custom_chat_template",
|
|
{
|
|
"group": "Frontend Language",
|
|
"pages": [
|
|
"docs/references/frontend/frontend_index",
|
|
"docs/references/frontend/frontend_tutorial",
|
|
"docs/references/frontend/choices_methods"
|
|
]
|
|
},
|
|
{
|
|
"group": "Cookbook",
|
|
"pages": [
|
|
"cookbook/base/reference/server_arguments"
|
|
]
|
|
},
|
|
"docs/references/post_training_integration"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tab": "Hardware",
|
|
"groups": [
|
|
{
|
|
"group": "Hardware Platforms",
|
|
"icon": "microchip",
|
|
"pages": [
|
|
"docs/hardware-platforms/overview",
|
|
"docs/hardware-platforms/nvidia-gpus",
|
|
"docs/hardware-platforms/amd_gpu",
|
|
"docs/hardware-platforms/apple_metal",
|
|
{
|
|
"group": "Ascend NPUs",
|
|
"pages": [
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_quick_start",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_support_features",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_support_models",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_quantization",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_deepseek_example",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_qwen3_examples",
|
|
"docs/hardware-platforms/ascend-npus/mindspore_backend",
|
|
"docs/hardware-platforms/ascend-npus/ascend_contribution_guide",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_support_new_models",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_best_practice",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_ring_sp_performance",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_qwen3_5_examples",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_glm5_examples",
|
|
"docs/hardware-platforms/ascend-npus/ascend_npu_environment_variables"
|
|
]
|
|
},
|
|
"docs/hardware-platforms/cpu_server",
|
|
{
|
|
"group": "Edge & Embedded",
|
|
"pages": [
|
|
"docs/hardware-platforms/nvidia_jetson"
|
|
]
|
|
},
|
|
"docs/hardware-platforms/mthreads_gpu",
|
|
"docs/hardware-platforms/tpu",
|
|
"docs/hardware-platforms/xpu",
|
|
"docs/hardware-platforms/plugin"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tab": "Cookbook",
|
|
"groups": [
|
|
{
|
|
"group": "Cookbook",
|
|
"icon": "book",
|
|
"pages": [
|
|
"cookbook/intro",
|
|
{
|
|
"group": "Autoregressive Models",
|
|
"pages": [
|
|
"cookbook/autoregressive/intro",
|
|
{
|
|
"group": "Qwen",
|
|
"pages": [
|
|
"cookbook/autoregressive/Qwen/Qwen3.6",
|
|
"cookbook/autoregressive/Qwen/Qwen3.5",
|
|
"cookbook/autoregressive/Qwen/Qwen3",
|
|
"cookbook/autoregressive/Qwen/Qwen3-Next",
|
|
"cookbook/autoregressive/Qwen/Qwen3-Coder",
|
|
"cookbook/autoregressive/Qwen/Qwen3-Coder-Next",
|
|
"cookbook/autoregressive/Qwen/Qwen3-VL",
|
|
"cookbook/autoregressive/Qwen/Qwen2.5-VL"
|
|
]
|
|
},
|
|
{
|
|
"group": "DeepSeek",
|
|
"pages": [
|
|
"cookbook/autoregressive/DeepSeek/DeepSeek-V4",
|
|
"cookbook/autoregressive/DeepSeek/DeepSeek-V3_2",
|
|
"cookbook/autoregressive/DeepSeek/DeepSeek-V3_1",
|
|
"cookbook/autoregressive/DeepSeek/DeepSeek-V3",
|
|
"cookbook/autoregressive/DeepSeek/DeepSeek-R1",
|
|
"cookbook/autoregressive/DeepSeek/DeepSeek-Math-V2",
|
|
"cookbook/autoregressive/DeepSeek/DeepSeek-OCR",
|
|
"cookbook/autoregressive/DeepSeek/DeepSeek-OCR-2"
|
|
]
|
|
},
|
|
{
|
|
"group": "Llama",
|
|
"pages": [
|
|
"cookbook/autoregressive/Llama/Llama4",
|
|
"cookbook/autoregressive/Llama/Llama3.3-70B",
|
|
"cookbook/autoregressive/Llama/Llama3.1"
|
|
]
|
|
},
|
|
{
|
|
"group": "GLM",
|
|
"pages": [
|
|
"cookbook/autoregressive/GLM/GLM-5.1",
|
|
"cookbook/autoregressive/GLM/GLM-5",
|
|
"cookbook/autoregressive/GLM/GLM-OCR",
|
|
"cookbook/autoregressive/GLM/GLM-Glyph",
|
|
"cookbook/autoregressive/GLM/GLM-4.7",
|
|
"cookbook/autoregressive/GLM/GLM-4.7-Flash",
|
|
"cookbook/autoregressive/GLM/GLM-4.6",
|
|
"cookbook/autoregressive/GLM/GLM-4.6V",
|
|
"cookbook/autoregressive/GLM/GLM-4.5",
|
|
"cookbook/autoregressive/GLM/GLM-4.5V"
|
|
]
|
|
},
|
|
{
|
|
"group": "Google",
|
|
"pages": [
|
|
"cookbook/autoregressive/Google/Gemma4"
|
|
]
|
|
},
|
|
{
|
|
"group": "OpenAI",
|
|
"pages": [
|
|
"cookbook/autoregressive/OpenAI/GPT-OSS"
|
|
]
|
|
},
|
|
{
|
|
"group": "Moonshotai",
|
|
"pages": [
|
|
"cookbook/autoregressive/Moonshotai/Kimi-K2.6",
|
|
"cookbook/autoregressive/Moonshotai/Kimi-K2.5",
|
|
"cookbook/autoregressive/Moonshotai/Kimi-K2",
|
|
"cookbook/autoregressive/Moonshotai/Kimi-Linear"
|
|
]
|
|
},
|
|
{
|
|
"group": "MiniMax",
|
|
"pages": [
|
|
"cookbook/autoregressive/MiniMax/MiniMax-M2.7",
|
|
"cookbook/autoregressive/MiniMax/MiniMax-M2",
|
|
"cookbook/autoregressive/MiniMax/MiniMax-M2.5"
|
|
]
|
|
},
|
|
{
|
|
"group": "NVIDIA",
|
|
"pages": [
|
|
"cookbook/autoregressive/NVIDIA/Nemotron3-Nano-Omni",
|
|
"cookbook/autoregressive/NVIDIA/Nemotron3-Nano",
|
|
"cookbook/autoregressive/NVIDIA/Nemotron3-Super"
|
|
]
|
|
},
|
|
{
|
|
"group": "Ernie",
|
|
"pages": [
|
|
"cookbook/autoregressive/Ernie/Ernie4.5",
|
|
"cookbook/autoregressive/Ernie/Ernie4.5-VL"
|
|
]
|
|
},
|
|
{
|
|
"group": "StepFun",
|
|
"pages": [
|
|
"cookbook/autoregressive/StepFun/Step3.5",
|
|
"cookbook/autoregressive/StepFun/Step3-VL-10B"
|
|
]
|
|
},
|
|
{
|
|
"group": "InclusionAI",
|
|
"pages": [
|
|
"cookbook/autoregressive/InclusionAI/Ling-2.6",
|
|
"cookbook/autoregressive/InclusionAI/Ling-2.5-1T",
|
|
"cookbook/autoregressive/InclusionAI/Ring-2.5-1T",
|
|
"cookbook/autoregressive/InclusionAI/LLaDA-2.1"
|
|
]
|
|
},
|
|
{
|
|
"group": "InternLM",
|
|
"pages": [
|
|
"cookbook/autoregressive/InternLM/Intern-S1"
|
|
]
|
|
},
|
|
{
|
|
"group": "InternVL",
|
|
"pages": [
|
|
"cookbook/autoregressive/InternVL/InternVL3.5"
|
|
]
|
|
},
|
|
{
|
|
"group": "Jina AI",
|
|
"pages": [
|
|
"cookbook/autoregressive/Jina/Jina-reranker-m0"
|
|
]
|
|
},
|
|
{
|
|
"group": "Mistral",
|
|
"pages": [
|
|
"cookbook/autoregressive/Mistral/Ministral-3",
|
|
"cookbook/autoregressive/Mistral/Mistral-Small-4",
|
|
"cookbook/autoregressive/Mistral/Mistral-Medium-3.5",
|
|
"cookbook/autoregressive/Mistral/Devstral-2"
|
|
]
|
|
},
|
|
{
|
|
"group": "Xiaomi",
|
|
"pages": [
|
|
"cookbook/autoregressive/Xiaomi/MiMo-V2.5",
|
|
"cookbook/autoregressive/Xiaomi/MiMo-V2-Flash"
|
|
]
|
|
},
|
|
{
|
|
"group": "FlashLabs",
|
|
"pages": [
|
|
"cookbook/autoregressive/FlashLabs/Chroma1.0"
|
|
]
|
|
},
|
|
{
|
|
"group": "Tencent",
|
|
"pages": [
|
|
"cookbook/autoregressive/Tencent/Hunyuan3-Preview"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"group": "Diffusion Models",
|
|
"pages": [
|
|
"cookbook/diffusion/intro",
|
|
{
|
|
"group": "FLUX",
|
|
"pages": [
|
|
"cookbook/diffusion/FLUX/FLUX"
|
|
]
|
|
},
|
|
{
|
|
"group": "Wan",
|
|
"pages": [
|
|
"cookbook/diffusion/Wan/Wan2.1",
|
|
"cookbook/diffusion/Wan/Wan2.2"
|
|
]
|
|
},
|
|
{
|
|
"group": "LTX",
|
|
"pages": [
|
|
"cookbook/diffusion/LTX/LTX2 & LTX2.3"
|
|
]
|
|
},
|
|
{
|
|
"group": "Qwen-Image",
|
|
"pages": [
|
|
"cookbook/diffusion/Qwen-Image/Qwen-Image",
|
|
"cookbook/diffusion/Qwen-Image/Qwen-Image-Edit"
|
|
]
|
|
},
|
|
{
|
|
"group": "Z-Image",
|
|
"pages": [
|
|
"cookbook/diffusion/Z-Image/Z-Image-Turbo"
|
|
]
|
|
},
|
|
{
|
|
"group": "MOVA",
|
|
"pages": [
|
|
"cookbook/diffusion/MOVA/MOVA"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"group": "SpecBundle",
|
|
"pages": [
|
|
"cookbook/specbundle/supported_models",
|
|
"cookbook/specbundle/specbundle_usage"
|
|
]
|
|
},
|
|
{
|
|
"group": "Benchmarks",
|
|
"pages": [
|
|
"cookbook/base/benchmarks/autoregressive_model_benchmark",
|
|
"cookbook/base/benchmarks/diffusion_model_benchmark"
|
|
]
|
|
}
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tab": "SGLang Diffusion",
|
|
"groups": [
|
|
{
|
|
"group": "SGLang Diffusion",
|
|
"icon": "sparkles",
|
|
"pages": [
|
|
"docs/sglang-diffusion/index",
|
|
"docs/sglang-diffusion/installation",
|
|
"docs/sglang-diffusion/compatibility_matrix",
|
|
"docs/sglang-diffusion/disaggregation",
|
|
"docs/sglang-diffusion/quantization",
|
|
{
|
|
"group": "Usage",
|
|
"pages": [
|
|
"docs/sglang-diffusion/api/cli",
|
|
"docs/sglang-diffusion/api/openai_api",
|
|
"docs/sglang-diffusion/api/post_processing"
|
|
]
|
|
},
|
|
{
|
|
"group": "Performance Optimization",
|
|
"pages": [
|
|
"docs/sglang-diffusion/performance-optimization",
|
|
"docs/sglang-diffusion/ring_sp_performance",
|
|
"docs/sglang-diffusion/attention_backends",
|
|
{
|
|
"group": "Inference Batching",
|
|
"pages": [
|
|
"docs/sglang-diffusion/dynamic_batching"
|
|
]
|
|
},
|
|
"docs/sglang-diffusion/profiling",
|
|
"docs/sglang-diffusion/ci_perf"
|
|
]
|
|
},
|
|
{
|
|
"group": "Caching Strategies",
|
|
"pages": [
|
|
"docs/sglang-diffusion/caching-acceleration",
|
|
"docs/sglang-diffusion/cache_dit",
|
|
"docs/sglang-diffusion/teacache"
|
|
]
|
|
},
|
|
{
|
|
"group": "References",
|
|
"pages": [
|
|
"docs/sglang-diffusion/environment_variables",
|
|
"docs/sglang-diffusion/support_new_models",
|
|
"docs/sglang-diffusion/contributing"
|
|
]
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"global": {
|
|
"anchors": []
|
|
}
|
|
},
|
|
"logo": {
|
|
"light": "/logo/logo.png",
|
|
"dark": "/logo/logo.png"
|
|
},
|
|
"contextual": {
|
|
"options": [
|
|
"copy",
|
|
"view",
|
|
"chatgpt",
|
|
"claude",
|
|
"perplexity",
|
|
"mcp",
|
|
"cursor",
|
|
"vscode"
|
|
]
|
|
},
|
|
"footer": {
|
|
"socials": {
|
|
"github": "https://github.com/sgl-project/sglang",
|
|
"x": "https://x.com/lmsysorg",
|
|
"linkedin": "https://www.linkedin.com/company/sgl-project/posts?feedView=all",
|
|
"slack": "https://slack.sglang.io/",
|
|
"discord": "https://discord.gg/4ugb2t6YY2"
|
|
}
|
|
}
|
|
}
|