mirror of
https://github.com/kvcache-ai/sglang.git
synced 2026-06-30 19:57:52 +00:00
Co-authored-by: AdityaVKochar <adityavardhankochar@gmail.com> Co-authored-by: mintlify[bot] <109931778+mintlify[bot]@users.noreply.github.com> Co-authored-by: adhyan-jain <adhyanjain2006@gmail.com> Co-authored-by: Adhyan Jain <71976554+adhyan-jain@users.noreply.github.com> Co-authored-by: Maitri-shah29 <maitrirajivshah@gmail.com> Co-authored-by: Adarsh Shirawalmath <114558126+adarshxs@users.noreply.github.com> Co-authored-by: Maitri Shah <shah29maitri@gmail.com> Co-authored-by: Aditya Vardhan Kochar <80113212+AdityaVKochar@users.noreply.github.com> Co-authored-by: Rishit Shivam <164783543+pokymono@users.noreply.github.com> Co-authored-by: Rishitshivam <164783543+Rishitshivam@users.noreply.github.com> Co-authored-by: IshhanKheria <ishhankheria06@gmail.com> Co-authored-by: Ishita Joshi <ishitata.joshi@gmail.com> Co-authored-by: Richard Chen <104477092+Richardczl98@users.noreply.github.com> Co-authored-by: longGGGGGG <553746008@qq.com> Co-authored-by: Richard <richardchen@radixark.ai> Co-authored-by: Nakul Sinha <nakul.new4socials@gmail.com> Co-authored-by: Divyam Agrawal <ludicrouslytrue@gmail.com> Co-authored-by: Richardczl98 <Zhenlinc@stanford.edu> Co-authored-by: Krishang Zinzuwadia <krishangzinzuwadia@gmail.com> Co-authored-by: nimeshas <nimesha.s106@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Jignas Paturu <86356085+JignasP@users.noreply.github.com> Co-authored-by: zijiexia <37504505+zijiexia@users.noreply.github.com>
173 lines
4.3 KiB
Plaintext
173 lines
4.3 KiB
Plaintext
---
|
|
title: "Tutorial: Sending a request"
|
|
metatags:
|
|
description: "This notebook provides a quick-start guide to using SGLang for chat completions after installation."
|
|
---
|
|
This notebook provides a quick-start guide to using SGLang for chat completions after installation. Once your server is running, API documentation is available at `http://localhost:30000/docs` (Swagger UI), `http://localhost:30000/redoc` (ReDoc), or `http://localhost:30000/openapi.json` (OpenAPI spec, useful for AI agents). Replace `30000` with your port if using a different one.
|
|
|
|
- For Vision Language Models, see [OpenAI APIs - Vision](./openai_api_vision).
|
|
- For Embedding Models, see [OpenAI APIs - Embedding](./openai_api_embeddings) and [Encode (embedding model)](./native_api#encode-embedding-model).
|
|
- For Reward Models, see [Classify (reward model)](./native_api#classify-reward-model).
|
|
|
|
|
|
## Launch A Server
|
|
|
|
|
|
|
|
```python Example
|
|
from sglang.test.doc_patch import launch_server_cmd
|
|
from sglang.utils import wait_for_server, print_highlight, terminate_process
|
|
|
|
# This is equivalent to running the following command in your terminal
|
|
# python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0
|
|
|
|
server_process, port = launch_server_cmd(
|
|
"""
|
|
python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \
|
|
--host 0.0.0.0 --log-level warning
|
|
"""
|
|
)
|
|
|
|
wait_for_server(f"http://localhost:{port}")
|
|
```
|
|
|
|
## Using cURL
|
|
|
|
|
|
|
|
|
|
```python Example
|
|
import subprocess, json
|
|
|
|
curl_command = f"""
|
|
curl -s http://localhost:{port}/v1/chat/completions \
|
|
-H "Content-Type: application/json" \
|
|
-d '{{"model": "qwen/qwen2.5-0.5b-instruct", "messages": [{{"role": "user", "content": "What is the capital of France?"}}]}}'
|
|
"""
|
|
|
|
response = json.loads(subprocess.check_output(curl_command, shell=True))
|
|
print_highlight(response)
|
|
```
|
|
|
|
## Using Python Requests
|
|
|
|
|
|
|
|
```python Example
|
|
import requests
|
|
|
|
url = f"http://localhost:{port}/v1/chat/completions"
|
|
|
|
data = {
|
|
"model": "qwen/qwen2.5-0.5b-instruct",
|
|
"messages": [{"role": "user", "content": "What is the capital of France?"}],
|
|
}
|
|
|
|
response = requests.post(url, json=data)
|
|
print_highlight(response.json())
|
|
```
|
|
|
|
## Using OpenAI Python Client
|
|
|
|
|
|
|
|
```python Example
|
|
import openai
|
|
|
|
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
|
|
|
|
response = client.chat.completions.create(
|
|
model="qwen/qwen2.5-0.5b-instruct",
|
|
messages=[
|
|
{"role": "user", "content": "List 3 countries and their capitals."},
|
|
],
|
|
temperature=0,
|
|
max_tokens=64,
|
|
)
|
|
print_highlight(response)
|
|
```
|
|
|
|
### Streaming
|
|
|
|
|
|
|
|
```python Example
|
|
import openai
|
|
|
|
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
|
|
|
|
# Use stream=True for streaming responses
|
|
response = client.chat.completions.create(
|
|
model="qwen/qwen2.5-0.5b-instruct",
|
|
messages=[
|
|
{"role": "user", "content": "List 3 countries and their capitals."},
|
|
],
|
|
temperature=0,
|
|
max_tokens=64,
|
|
stream=True,
|
|
)
|
|
|
|
# Handle the streaming output
|
|
for chunk in response:
|
|
if chunk.choices[0].delta.content:
|
|
print(chunk.choices[0].delta.content, end="", flush=True)
|
|
```
|
|
|
|
## Using Native Generation APIs
|
|
|
|
You can also call the native `/generate` endpoint, which provides more flexibility, using the Python `requests` library. An API reference is available at [Sampling Parameters](./sampling_params).
|
|
|
|
|
|
|
|
```python Example
|
|
import requests
|
|
|
|
response = requests.post(
|
|
f"http://localhost:{port}/generate",
|
|
json={
|
|
"text": "The capital of France is",
|
|
"sampling_params": {
|
|
"temperature": 0,
|
|
"max_new_tokens": 32,
|
|
},
|
|
},
|
|
)
|
|
|
|
print_highlight(response.json())
|
|
```
|
|
### Streaming
|
|
|
|
|
|
|
|
```python Example
|
|
import requests, json
|
|
|
|
response = requests.post(
|
|
f"http://localhost:{port}/generate",
|
|
json={
|
|
"text": "The capital of France is",
|
|
"sampling_params": {
|
|
"temperature": 0,
|
|
"max_new_tokens": 32,
|
|
},
|
|
"stream": True,
|
|
},
|
|
stream=True,
|
|
)
|
|
|
|
prev = 0
|
|
for chunk in response.iter_lines(decode_unicode=False):
|
|
chunk = chunk.decode("utf-8")
|
|
if chunk and chunk.startswith("data:"):
|
|
if chunk == "data: [DONE]":
|
|
break
|
|
data = json.loads(chunk[5:].strip("\n"))
|
|
output = data["text"]
|
|
print(output[prev:], end="", flush=True)
|
|
prev = len(output)
|
|
```
|
|
|
|
```python Example
|
|
terminate_process(server_process)
|
|
```
|