mirror of
https://github.com/kvcache-ai/sglang.git
synced 2026-06-30 19:57:52 +00:00
Co-authored-by: AdityaVKochar <adityavardhankochar@gmail.com> Co-authored-by: mintlify[bot] <109931778+mintlify[bot]@users.noreply.github.com> Co-authored-by: adhyan-jain <adhyanjain2006@gmail.com> Co-authored-by: Adhyan Jain <71976554+adhyan-jain@users.noreply.github.com> Co-authored-by: Maitri-shah29 <maitrirajivshah@gmail.com> Co-authored-by: Adarsh Shirawalmath <114558126+adarshxs@users.noreply.github.com> Co-authored-by: Maitri Shah <shah29maitri@gmail.com> Co-authored-by: Aditya Vardhan Kochar <80113212+AdityaVKochar@users.noreply.github.com> Co-authored-by: Rishit Shivam <164783543+pokymono@users.noreply.github.com> Co-authored-by: Rishitshivam <164783543+Rishitshivam@users.noreply.github.com> Co-authored-by: IshhanKheria <ishhankheria06@gmail.com> Co-authored-by: Ishita Joshi <ishitata.joshi@gmail.com> Co-authored-by: Richard Chen <104477092+Richardczl98@users.noreply.github.com> Co-authored-by: longGGGGGG <553746008@qq.com> Co-authored-by: Richard <richardchen@radixark.ai> Co-authored-by: Nakul Sinha <nakul.new4socials@gmail.com> Co-authored-by: Divyam Agrawal <ludicrouslytrue@gmail.com> Co-authored-by: Richardczl98 <Zhenlinc@stanford.edu> Co-authored-by: Krishang Zinzuwadia <krishangzinzuwadia@gmail.com> Co-authored-by: nimeshas <nimesha.s106@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Jignas Paturu <86356085+JignasP@users.noreply.github.com> Co-authored-by: zijiexia <37504505+zijiexia@users.noreply.github.com>
804 lines
22 KiB
Plaintext
804 lines
22 KiB
Plaintext
---
|
|
title: "Structured Outputs"
|
|
metatags:
|
|
description: "SGLang structured outputs: JSON schema, regex, EBNF constraints. XGrammar, Outlines, Llguidance backends for guaranteed output format."
|
|
---
|
|
You can specify a JSON schema, [regular expression](https://en.wikipedia.org/wiki/Regular_expression) or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.
|
|
|
|
SGLang supports three grammar backends:
|
|
|
|
- [XGrammar](https://github.com/mlc-ai/xgrammar) (default): Supports JSON schema, regular expression, and EBNF constraints.
|
|
- [Outlines](https://github.com/dottxt-ai/outlines): Supports JSON schema and regular expression constraints.
|
|
- [Llguidance](https://github.com/guidance-ai/llguidance): Supports JSON schema, regular expression, and EBNF constraints.
|
|
|
|
We suggest using XGrammar for its better performance and utility. XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md). For more details, see the [XGrammar technical overview](https://blog.mlc.ai/2024/11/22/achieving-efficient-flexible-portable-structured-generation-with-xgrammar).
|
|
|
|
To use Outlines, simply add `--grammar-backend outlines` when launching the server.
|
|
To use llguidance, add `--grammar-backend llguidance` when launching the server.
|
|
If no backend is specified, XGrammar will be used as the default.
|
|
|
|
For better output quality, **it's advisable to explicitly include instructions in the prompt to guide the model to generate the desired format.** For example, you can specify, 'Please generate the output in the following JSON format: ...'.
|
|
|
|
|
|
|
|
## OpenAI Compatible API
|
|
|
|
|
|
|
|
```python Example
|
|
import openai
|
|
import os
|
|
|
|
from sglang.test.doc_patch import launch_server_cmd
|
|
from sglang.utils import wait_for_server, print_highlight, terminate_process
|
|
|
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
|
|
server_process, port = launch_server_cmd(
|
|
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning"
|
|
)
|
|
|
|
wait_for_server(f"http://localhost:{port}")
|
|
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
|
|
```
|
|
|
|
### JSON
|
|
|
|
You can directly define a JSON schema or use [Pydantic](https://docs.pydantic.dev/latest/) to define and validate the response.
|
|
|
|
|
|
**Using Pydantic**
|
|
|
|
|
|
|
|
```python Example
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
# Define the schema using Pydantic
|
|
class CapitalInfo(BaseModel):
    # Response schema for the capital-city examples. Documented with comments
    # rather than a docstring so the generated JSON schema stays unchanged.

    # Capital name; the pattern restricts it to word characters.
    name: str = Field(..., pattern=r"^\w+$", description="Name of the capital city")
    # Population of the capital, as an integer.
    population: int = Field(..., description="Population of the capital city")
|
|
|
|
|
|
response = client.chat.completions.create(
|
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
messages=[
|
|
{
|
|
"role": "user",
|
|
"content": "Please generate the information of the capital of France in the JSON format.",
|
|
},
|
|
],
|
|
temperature=0,
|
|
max_tokens=128,
|
|
response_format={
|
|
"type": "json_schema",
|
|
"json_schema": {
|
|
"name": "foo",
|
|
# convert the pydantic model to json schema
|
|
"schema": CapitalInfo.model_json_schema(),
|
|
},
|
|
},
|
|
)
|
|
|
|
response_content = response.choices[0].message.content
|
|
# validate the JSON response by the pydantic model
|
|
capital_info = CapitalInfo.model_validate_json(response_content)
|
|
print_highlight(f"Validated response: {capital_info.model_dump_json()}")
|
|
```
|
|
|
|
**JSON Schema Directly**
|
|
|
|
|
|
|
|
|
|
```python Example
|
|
import json
|
|
|
|
json_schema = json.dumps(
|
|
{
|
|
"type": "object",
|
|
"properties": {
|
|
"name": {"type": "string", "pattern": "^[\\w]+$"},
|
|
"population": {"type": "integer"},
|
|
},
|
|
"required": ["name", "population"],
|
|
}
|
|
)
|
|
|
|
response = client.chat.completions.create(
|
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
messages=[
|
|
{
|
|
"role": "user",
|
|
"content": "Give me the information of the capital of France in the JSON format.",
|
|
},
|
|
],
|
|
temperature=0,
|
|
max_tokens=128,
|
|
response_format={
|
|
"type": "json_schema",
|
|
"json_schema": {"name": "foo", "schema": json.loads(json_schema)},
|
|
},
|
|
)
|
|
|
|
print_highlight(response.choices[0].message.content)
|
|
```
|
|
|
|
### EBNF
|
|
|
|
|
|
|
|
```python Example
|
|
ebnf_grammar = """
|
|
root ::= city | description
|
|
city ::= "London" | "Paris" | "Berlin" | "Rome"
|
|
description ::= city " is " status
|
|
status ::= "the capital of " country
|
|
country ::= "England" | "France" | "Germany" | "Italy"
|
|
"""
|
|
|
|
response = client.chat.completions.create(
|
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
messages=[
|
|
{"role": "system", "content": "You are a helpful geography bot."},
|
|
{
|
|
"role": "user",
|
|
"content": "Give me the information of the capital of France.",
|
|
},
|
|
],
|
|
temperature=0,
|
|
max_tokens=32,
|
|
extra_body={"ebnf": ebnf_grammar},
|
|
)
|
|
|
|
print_highlight(response.choices[0].message.content)
|
|
```
|
|
|
|
### Regular expression
|
|
|
|
|
|
|
|
```python Example
|
|
response = client.chat.completions.create(
|
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
messages=[
|
|
{"role": "user", "content": "What is the capital of France?"},
|
|
],
|
|
temperature=0,
|
|
max_tokens=128,
|
|
extra_body={"regex": "(Paris|London)"},
|
|
)
|
|
|
|
print_highlight(response.choices[0].message.content)
|
|
```
|
|
|
|
### Structural Tag
|
|
|
|
|
|
|
|
```python Example
|
|
tool_get_current_weather = {
|
|
"type": "function",
|
|
"function": {
|
|
"name": "get_current_weather",
|
|
"description": "Get the current weather in a given location",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"city": {
|
|
"type": "string",
|
|
"description": "The city to find the weather for, e.g. 'San Francisco'",
|
|
},
|
|
"state": {
|
|
"type": "string",
|
|
"description": "the two-letter abbreviation for the state that the city is"
|
|
" in, e.g. 'CA' which would mean 'California'",
|
|
},
|
|
"unit": {
|
|
"type": "string",
|
|
"description": "The unit to fetch the temperature in",
|
|
"enum": ["celsius", "fahrenheit"],
|
|
},
|
|
},
|
|
"required": ["city", "state", "unit"],
|
|
},
|
|
},
|
|
}
|
|
|
|
tool_get_current_date = {
|
|
"type": "function",
|
|
"function": {
|
|
"name": "get_current_date",
|
|
"description": "Get the current date and time for a given timezone",
|
|
"parameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"timezone": {
|
|
"type": "string",
|
|
"description": "The timezone to fetch the current date and time for, e.g. 'America/New_York'",
|
|
}
|
|
},
|
|
"required": ["timezone"],
|
|
},
|
|
},
|
|
}
|
|
|
|
schema_get_current_weather = tool_get_current_weather["function"]["parameters"]
|
|
schema_get_current_date = tool_get_current_date["function"]["parameters"]
|
|
|
|
|
|
def get_messages():
    """Build the chat messages used by the structural-tag examples.

    Returns:
        A two-element list of chat messages: a system message that embeds
        ``tool_get_current_weather["function"]`` and
        ``tool_get_current_date["function"]`` together with the
        ``<function=...>{...}</function>`` call format, followed by the user
        request.
    """
    return [
        {
            "role": "system",
            # Doubled braces are f-string escapes; they render as literal
            # "{" / "}" in the prompt sent to the model.
            "content": f"""
# Tool Instructions
- Always execute python code in messages that you share.
- When looking for real time information use relevant functions if available else fallback to brave_search
You have access to the following functions:
Use the function 'get_current_weather' to: Get the current weather in a given location
{tool_get_current_weather["function"]}
Use the function 'get_current_date' to: Get the current date and time for a given timezone
{tool_get_current_date["function"]}
If a you choose to call a function ONLY reply in the following format:
<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}
where
start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function argument value as value.
end_tag => `</function>`
Here is an example,
<function=example_function_name>{{"example_name": "example_value"}}</function>
Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
You are a helpful assistant.""",
        },
        {
            "role": "user",
            "content": "You are in New York. Please get the current date and time, and the weather.",
        },
    ]
|
|
|
|
|
|
messages = get_messages()
|
|
|
|
response = client.chat.completions.create(
|
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
messages=messages,
|
|
response_format={
|
|
"type": "structural_tag",
|
|
"structures": [
|
|
{
|
|
"begin": "<function=get_current_weather>",
|
|
"schema": schema_get_current_weather,
|
|
"end": "</function>",
|
|
},
|
|
{
|
|
"begin": "<function=get_current_date>",
|
|
"schema": schema_get_current_date,
|
|
"end": "</function>",
|
|
},
|
|
],
|
|
"triggers": ["<function="],
|
|
},
|
|
)
|
|
|
|
print_highlight(response.choices[0].message.content)
|
|
```
|
|
|
|
|
|
```python Example
|
|
# Support for XGrammar latest structural tag format
|
|
# https://xgrammar.mlc.ai/docs/tutorials/structural_tag.html
|
|
|
|
response = client.chat.completions.create(
|
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
messages=messages,
|
|
response_format={
|
|
"type": "structural_tag",
|
|
"format": {
|
|
"type": "triggered_tags",
|
|
"triggers": ["<function="],
|
|
"tags": [
|
|
{
|
|
"begin": "<function=get_current_weather>",
|
|
"content": {
|
|
"type": "json_schema",
|
|
"json_schema": schema_get_current_weather,
|
|
},
|
|
"end": "</function>",
|
|
},
|
|
{
|
|
"begin": "<function=get_current_date>",
|
|
"content": {
|
|
"type": "json_schema",
|
|
"json_schema": schema_get_current_date,
|
|
},
|
|
"end": "</function>",
|
|
},
|
|
],
|
|
"at_least_one": False,
|
|
"stop_after_first": False,
|
|
},
|
|
},
|
|
)
|
|
|
|
print_highlight(response.choices[0].message.content)
|
|
```
|
|
|
|
## Native API and SGLang Runtime (SRT)
|
|
|
|
|
|
### JSON
|
|
|
|
|
|
**Using Pydantic**
|
|
|
|
|
|
|
|
```python Example
|
|
import requests
|
|
import json
|
|
from pydantic import BaseModel, Field
|
|
|
|
from transformers import AutoTokenizer
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
|
|
|
|
|
|
# Define the schema using Pydantic
|
|
class CapitalInfo(BaseModel):
    # Response schema for the capital-city examples. Documented with comments
    # rather than a docstring so the generated JSON schema stays unchanged.

    # Capital name; the pattern restricts it to word characters.
    name: str = Field(..., pattern=r"^\w+$", description="Name of the capital city")
    # Population of the capital, as an integer.
    population: int = Field(..., description="Population of the capital city")
|
|
|
|
|
|
# Make API request
|
|
messages = [
|
|
{
|
|
"role": "user",
|
|
"content": "Here is the information of the capital of France in the JSON format.\n",
|
|
}
|
|
]
|
|
text = tokenizer.apply_chat_template(
|
|
messages, tokenize=False, add_generation_prompt=True, return_dict=False
|
|
)
|
|
response = requests.post(
|
|
f"http://localhost:{port}/generate",
|
|
json={
|
|
"text": text,
|
|
"sampling_params": {
|
|
"temperature": 0,
|
|
"max_new_tokens": 64,
|
|
"json_schema": json.dumps(CapitalInfo.model_json_schema()),
|
|
},
|
|
},
|
|
)
|
|
print_highlight(response.json())
|
|
|
|
|
|
response_data = json.loads(response.json()["text"])
|
|
# validate the response by the pydantic model
|
|
capital_info = CapitalInfo.model_validate(response_data)
|
|
print_highlight(f"Validated response: {capital_info.model_dump_json()}")
|
|
```
|
|
|
|
**JSON Schema Directly**
|
|
|
|
|
|
|
|
```python Example
|
|
json_schema = json.dumps(
|
|
{
|
|
"type": "object",
|
|
"properties": {
|
|
"name": {"type": "string", "pattern": "^[\\w]+$"},
|
|
"population": {"type": "integer"},
|
|
},
|
|
"required": ["name", "population"],
|
|
}
|
|
)
|
|
|
|
# JSON
|
|
response = requests.post(
|
|
f"http://localhost:{port}/generate",
|
|
json={
|
|
"text": text,
|
|
"sampling_params": {
|
|
"temperature": 0,
|
|
"max_new_tokens": 64,
|
|
"json_schema": json_schema,
|
|
},
|
|
},
|
|
)
|
|
|
|
print_highlight(response.json())
|
|
```
|
|
|
|
### EBNF
|
|
|
|
|
|
|
|
```python Example
|
|
messages = [
|
|
{
|
|
"role": "user",
|
|
"content": "Give me the information of the capital of France.",
|
|
}
|
|
]
|
|
text = tokenizer.apply_chat_template(
|
|
messages, tokenize=False, add_generation_prompt=True, return_dict=False
|
|
)
|
|
response = requests.post(
|
|
f"http://localhost:{port}/generate",
|
|
json={
|
|
"text": text,
|
|
"sampling_params": {
|
|
"max_new_tokens": 128,
|
|
"temperature": 0,
|
|
"n": 3,
|
|
"ebnf": (
|
|
"root ::= city | description\n"
|
|
'city ::= "London" | "Paris" | "Berlin" | "Rome"\n'
|
|
'description ::= city " is " status\n'
|
|
'status ::= "the capital of " country\n'
|
|
'country ::= "England" | "France" | "Germany" | "Italy"'
|
|
),
|
|
},
|
|
"stream": False,
|
|
"return_logprob": False,
|
|
},
|
|
)
|
|
|
|
print_highlight(response.json())
|
|
```
|
|
|
|
### Regular expression
|
|
|
|
|
|
|
|
```python Example
|
|
messages = [
|
|
{
|
|
"role": "user",
|
|
"content": "Paris is the capital of",
|
|
}
|
|
]
|
|
text = tokenizer.apply_chat_template(
|
|
messages, tokenize=False, add_generation_prompt=True, return_dict=False
|
|
)
|
|
response = requests.post(
|
|
f"http://localhost:{port}/generate",
|
|
json={
|
|
"text": text,
|
|
"sampling_params": {
|
|
"temperature": 0,
|
|
"max_new_tokens": 64,
|
|
"regex": "(France|England)",
|
|
},
|
|
},
|
|
)
|
|
print_highlight(response.json())
|
|
```
|
|
|
|
### Structural Tag
|
|
|
|
|
|
|
|
```python Example
|
|
from transformers import AutoTokenizer
|
|
|
|
# generate an answer
|
|
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
|
|
|
|
# `messages` was overwritten by the regular-expression example above
# ("Paris is the capital of"), so rebuild the tool-calling conversation
# before rendering the prompt for the structural-tag request.
messages = get_messages()
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, return_dict=False
)
|
|
payload = {
|
|
"text": text,
|
|
"sampling_params": {
|
|
"structural_tag": json.dumps(
|
|
{
|
|
"type": "structural_tag",
|
|
"structures": [
|
|
{
|
|
"begin": "<function=get_current_weather>",
|
|
"schema": schema_get_current_weather,
|
|
"end": "</function>",
|
|
},
|
|
{
|
|
"begin": "<function=get_current_date>",
|
|
"schema": schema_get_current_date,
|
|
"end": "</function>",
|
|
},
|
|
],
|
|
"triggers": ["<function="],
|
|
}
|
|
)
|
|
},
|
|
}
|
|
|
|
|
|
# Send POST request to the API endpoint
|
|
response = requests.post(f"http://localhost:{port}/generate", json=payload)
|
|
print_highlight(response.json())
|
|
```
|
|
|
|
|
|
```python Example
|
|
# Support for XGrammar latest structural tag format
|
|
# https://xgrammar.mlc.ai/docs/tutorials/structural_tag.html
|
|
|
|
payload = {
|
|
"text": text,
|
|
"sampling_params": {
|
|
"structural_tag": json.dumps(
|
|
{
|
|
"type": "structural_tag",
|
|
"format": {
|
|
"type": "triggered_tags",
|
|
"triggers": ["<function="],
|
|
"tags": [
|
|
{
|
|
"begin": "<function=get_current_weather>",
|
|
"content": {
|
|
"type": "json_schema",
|
|
"json_schema": schema_get_current_weather,
|
|
},
|
|
"end": "</function>",
|
|
},
|
|
{
|
|
"begin": "<function=get_current_date>",
|
|
"content": {
|
|
"type": "json_schema",
|
|
"json_schema": schema_get_current_date,
|
|
},
|
|
"end": "</function>",
|
|
},
|
|
],
|
|
"at_least_one": False,
|
|
"stop_after_first": False,
|
|
},
|
|
}
|
|
)
|
|
},
|
|
}
|
|
|
|
|
|
# Send POST request to the API endpoint
|
|
response = requests.post(f"http://localhost:{port}/generate", json=payload)
|
|
print_highlight(response.json())
|
|
```
|
|
|
|
|
|
```python Example
|
|
terminate_process(server_process)
|
|
```
|
|
|
|
## Offline Engine API
|
|
|
|
|
|
|
|
```python Example
|
|
import sglang as sgl
|
|
|
|
llm = sgl.Engine(
|
|
model_path="meta-llama/Meta-Llama-3.1-8B-Instruct", grammar_backend="xgrammar"
|
|
)
|
|
```
|
|
|
|
### JSON
|
|
|
|
|
|
**Using Pydantic**
|
|
|
|
|
|
|
|
```python Example
|
|
import json
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
prompts = [
|
|
"Give me the information of the capital of China in the JSON format.",
|
|
"Give me the information of the capital of France in the JSON format.",
|
|
"Give me the information of the capital of Ireland in the JSON format.",
|
|
]
|
|
|
|
|
|
# Define the schema using Pydantic
|
|
class CapitalInfo(BaseModel):
    # Response schema for the capital-city examples. Documented with comments
    # rather than a docstring so the generated JSON schema stays unchanged.

    # Capital name; the pattern restricts it to word characters.
    name: str = Field(..., pattern=r"^\w+$", description="Name of the capital city")
    # Population of the capital, as an integer.
    population: int = Field(..., description="Population of the capital city")
|
|
|
|
|
|
sampling_params = {
|
|
"temperature": 0.1,
|
|
"top_p": 0.95,
|
|
"json_schema": json.dumps(CapitalInfo.model_json_schema()),
|
|
}
|
|
|
|
outputs = llm.generate(prompts, sampling_params)
|
|
for prompt, output in zip(prompts, outputs):
|
|
print_highlight("===============================")
|
|
print_highlight(f"Prompt: {prompt}") # validate the output by the pydantic model
|
|
capital_info = CapitalInfo.model_validate_json(output["text"])
|
|
print_highlight(f"Validated output: {capital_info.model_dump_json()}")
|
|
```
|
|
|
|
**JSON Schema Directly**
|
|
|
|
|
|
|
|
```python Example
|
|
prompts = [
|
|
"Give me the information of the capital of China in the JSON format.",
|
|
"Give me the information of the capital of France in the JSON format.",
|
|
"Give me the information of the capital of Ireland in the JSON format.",
|
|
]
|
|
|
|
json_schema = json.dumps(
|
|
{
|
|
"type": "object",
|
|
"properties": {
|
|
"name": {"type": "string", "pattern": "^[\\w]+$"},
|
|
"population": {"type": "integer"},
|
|
},
|
|
"required": ["name", "population"],
|
|
}
|
|
)
|
|
|
|
sampling_params = {"temperature": 0.1, "top_p": 0.95, "json_schema": json_schema}
|
|
|
|
outputs = llm.generate(prompts, sampling_params)
|
|
for prompt, output in zip(prompts, outputs):
|
|
print_highlight("===============================")
|
|
print_highlight(f"Prompt: {prompt}\nGenerated text: {output['text']}")
|
|
```
|
|
|
|
### EBNF
|
|
|
|
|
|
|
|
|
|
```python Example
|
|
prompts = [
|
|
"Give me the information of the capital of France.",
|
|
"Give me the information of the capital of Germany.",
|
|
"Give me the information of the capital of Italy.",
|
|
]
|
|
|
|
sampling_params = {
|
|
"temperature": 0.8,
|
|
"top_p": 0.95,
|
|
"ebnf": (
|
|
"root ::= city | description\n"
|
|
'city ::= "London" | "Paris" | "Berlin" | "Rome"\n'
|
|
'description ::= city " is " status\n'
|
|
'status ::= "the capital of " country\n'
|
|
'country ::= "England" | "France" | "Germany" | "Italy"'
|
|
),
|
|
}
|
|
|
|
outputs = llm.generate(prompts, sampling_params)
|
|
for prompt, output in zip(prompts, outputs):
|
|
print_highlight("===============================")
|
|
print_highlight(f"Prompt: {prompt}\nGenerated text: {output['text']}")
|
|
```
|
|
|
|
### Regular expression
|
|
|
|
|
|
|
|
```python Example
|
|
prompts = [
|
|
"Please provide information about London as a major global city:",
|
|
"Please provide information about Paris as a major global city:",
|
|
]
|
|
|
|
sampling_params = {"temperature": 0.8, "top_p": 0.95, "regex": "(France|England)"}
|
|
|
|
outputs = llm.generate(prompts, sampling_params)
|
|
for prompt, output in zip(prompts, outputs):
|
|
print_highlight("===============================")
|
|
print_highlight(f"Prompt: {prompt}\nGenerated text: {output['text']}")
|
|
```
|
|
|
|
### Structural Tag
|
|
|
|
|
|
|
|
```python Example
|
|
# Earlier examples reassign `messages`, so rebuild the tool-calling
# conversation here to make sure the structural-tag prompt is the one
# that actually asks for function calls.
messages = get_messages()
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, return_dict=False
)
|
|
prompts = [text]
|
|
|
|
|
|
sampling_params = {
|
|
"temperature": 0.8,
|
|
"top_p": 0.95,
|
|
"structural_tag": json.dumps(
|
|
{
|
|
"type": "structural_tag",
|
|
"structures": [
|
|
{
|
|
"begin": "<function=get_current_weather>",
|
|
"schema": schema_get_current_weather,
|
|
"end": "</function>",
|
|
},
|
|
{
|
|
"begin": "<function=get_current_date>",
|
|
"schema": schema_get_current_date,
|
|
"end": "</function>",
|
|
},
|
|
],
|
|
"triggers": ["<function="],
|
|
}
|
|
),
|
|
}
|
|
|
|
|
|
# Run generation in-process with the offline engine (no HTTP request is sent)
|
|
outputs = llm.generate(prompts, sampling_params)
|
|
for prompt, output in zip(prompts, outputs):
|
|
print_highlight("===============================")
|
|
print_highlight(f"Prompt: {prompt}\nGenerated text: {output['text']}")
|
|
```
|
|
|
|
|
|
```python Example
|
|
# Support for XGrammar latest structural tag format
|
|
# https://xgrammar.mlc.ai/docs/tutorials/structural_tag.html
|
|
|
|
sampling_params = {
|
|
"temperature": 0.8,
|
|
"top_p": 0.95,
|
|
"structural_tag": json.dumps(
|
|
{
|
|
"type": "structural_tag",
|
|
"format": {
|
|
"type": "triggered_tags",
|
|
"triggers": ["<function="],
|
|
"tags": [
|
|
{
|
|
"begin": "<function=get_current_weather>",
|
|
"content": {
|
|
"type": "json_schema",
|
|
"json_schema": schema_get_current_weather,
|
|
},
|
|
"end": "</function>",
|
|
},
|
|
{
|
|
"begin": "<function=get_current_date>",
|
|
"content": {
|
|
"type": "json_schema",
|
|
"json_schema": schema_get_current_date,
|
|
},
|
|
"end": "</function>",
|
|
},
|
|
],
|
|
"at_least_one": False,
|
|
"stop_after_first": False,
|
|
},
|
|
}
|
|
),
|
|
}
|
|
|
|
|
|
# Run generation in-process with the offline engine (no HTTP request is sent)
|
|
outputs = llm.generate(prompts, sampling_params)
|
|
for prompt, output in zip(prompts, outputs):
|
|
print_highlight("===============================")
|
|
print_highlight(f"Prompt: {prompt}\nGenerated text: {output['text']}")
|
|
```
|
|
|
|
|
|
```python Example
|
|
llm.shutdown()
|
|
```
|