mirror of
https://github.com/SillyTavern/SillyTavern-Extras.git
synced 2026-04-28 18:31:19 +00:00
[Feature] Edge Text to speech #34
This commit is contained in:
25
README.md
25
README.md
@@ -121,7 +121,8 @@ cd SillyTavern-extras
|
||||
| `summarize` | Text summarization | ✔️ Yes |
|
||||
| `classify` | Text sentiment classification | ✔️ Yes |
|
||||
| `sd` | Stable Diffusion image generation | :x: No (✔️ remote) |
|
||||
| `tts` | [Silero TTS server](https://github.com/ouoertheo/silero-api-server) | :x: No |
|
||||
| `silero-tts` | [Silero TTS server](https://github.com/ouoertheo/silero-api-server) | :x: No |
|
||||
| `edge-tts` | [Microsoft Edge TTS client](https://github.com/rany2/edge-tts) | ✔️ Yes |
|
||||
| `chromadb` | Infinity context server | :x: No |
|
||||
|
||||
|
||||
@@ -308,7 +309,7 @@ None
|
||||
{ "previous_model": "name of the previous model", "current_model": "name of the newly loaded model" }
|
||||
```
|
||||
|
||||
### Generate TTS voice
|
||||
### Generate Silero TTS voice
|
||||
`POST /api/tts/generate`
|
||||
#### **Input**
|
||||
```
|
||||
@@ -317,7 +318,7 @@ None
|
||||
#### **Output**
|
||||
WAV audio file.
|
||||
|
||||
### Get TTS voices
|
||||
### Get Silero TTS voices
|
||||
`GET /api/tts/speakers`
|
||||
#### **Output**
|
||||
```
|
||||
@@ -330,7 +331,7 @@ WAV audio file.
|
||||
]
|
||||
```
|
||||
|
||||
### Get TTS voice sample
|
||||
### Get Silero TTS voice sample
|
||||
`GET /api/tts/sample/<voice_id>`
|
||||
#### **Output**
|
||||
WAV audio file.
|
||||
@@ -400,3 +401,19 @@ WAV audio file.
|
||||
```
|
||||
{ "chat_id": "chat1 - 2023-04-12" }
|
||||
```
|
||||
|
||||
### Get a list of Edge TTS voices
|
||||
`GET /api/edge-tts/list`
|
||||
#### **Output**
|
||||
```
|
||||
[{"Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, AdriNeural)", "ShortName": "af-ZA-AdriNeural", "Gender": "Female", "Locale": "af-ZA", "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", "FriendlyName": "Microsoft Adri Online (Natural) - Afrikaans (South Africa)", "Status": "GA", "VoiceTag": {"ContentCategories": ["General"], "VoicePersonalities": ["Friendly", "Positive"]}}]
|
||||
```
|
||||
|
||||
### Generate Edge TTS voice
|
||||
`POST /api/edge-tts/generate`
|
||||
#### **Input**
|
||||
```
|
||||
{ "text": "Text to narrate", "voice": "af-ZA-AdriNeural" }
|
||||
```
|
||||
#### **Output**
|
||||
MP3 audio file.
|
||||
|
||||
@@ -14,4 +14,5 @@ transformers==4.28.1
|
||||
diffusers==0.16.1
|
||||
silero-api-server
|
||||
chromadb
|
||||
sentence_transformers
|
||||
sentence_transformers
|
||||
edge-tts
|
||||
@@ -7,4 +7,5 @@ colorama
|
||||
--extra-index-url https://download.pytorch.org/whl/cu117
|
||||
torch==2.0.0+cu117
|
||||
transformers==4.28.1
|
||||
webuiapi
|
||||
webuiapi
|
||||
edge-tts
|
||||
41
server.py
41
server.py
@@ -3,6 +3,7 @@ from flask import (
|
||||
Flask,
|
||||
jsonify,
|
||||
request,
|
||||
Response,
|
||||
render_template_string,
|
||||
abort,
|
||||
send_from_directory,
|
||||
@@ -207,6 +208,12 @@ elif "sd" in modules and sd_use_remote:
|
||||
modules.remove("sd")
|
||||
|
||||
if "tts" in modules:
    # "tts" is the deprecated name for the Silero module: swap it for the
    # current name and tell the user about the change.
    print("tts module is deprecated. Please use silero-tts instead.")
    modules.remove("tts")
    # Do not list the module twice when both names were requested.
    if "silero-tts" not in modules:
        modules.append("silero-tts")
|
||||
|
||||
|
||||
if "silero-tts" in modules:
|
||||
if not os.path.exists(SILERO_SAMPLES_PATH):
|
||||
os.makedirs(SILERO_SAMPLES_PATH)
|
||||
print("Initializing Silero TTS server")
|
||||
@@ -218,6 +225,12 @@ if "tts" in modules:
|
||||
tts_service.update_sample_text(SILERO_SAMPLE_TEXT)
|
||||
tts_service.generate_samples()
|
||||
|
||||
|
||||
if "edge-tts" in modules:
|
||||
print("Initializing Edge TTS client")
|
||||
import tts_edge as edge
|
||||
|
||||
|
||||
if "chromadb" in modules:
|
||||
print("Initializing ChromaDB")
|
||||
import chromadb
|
||||
@@ -609,6 +622,7 @@ def get_modules():
|
||||
|
||||
|
||||
@app.route("/api/tts/speakers", methods=["GET"])
|
||||
@require_module("silero-tts")
|
||||
def tts_speakers():
|
||||
voices = [
|
||||
{
|
||||
@@ -622,6 +636,7 @@ def tts_speakers():
|
||||
|
||||
|
||||
@app.route("/api/tts/generate", methods=["POST"])
|
||||
@require_module("silero-tts")
|
||||
def tts_generate():
|
||||
voice = request.get_json()
|
||||
if "text" not in voice or not isinstance(voice["text"], str):
|
||||
@@ -639,10 +654,36 @@ def tts_generate():
|
||||
|
||||
|
||||
@app.route("/api/tts/sample/<speaker>", methods=["GET"])
@require_module("silero-tts")
def tts_play_sample(speaker: str):
    """Send ``<speaker>.wav`` from the Silero samples directory."""
    sample_filename = speaker + ".wav"
    return send_from_directory(SILERO_SAMPLES_PATH, sample_filename)
|
||||
|
||||
|
||||
@app.route("/api/edge-tts/list", methods=["GET"])
@require_module("edge-tts")
def edge_tts_list():
    """Return the voices reported by the Edge TTS client as a JSON response."""
    return jsonify(edge.get_voices())
|
||||
|
||||
|
||||
@app.route("/api/edge-tts/generate", methods=["POST"])
@require_module("edge-tts")
def edge_tts_generate():
    """Synthesize speech with the Edge TTS client and return it as audio/mpeg.

    Expects a JSON object with string fields "text" and "voice".
    Aborts with 400 when the body is not a JSON object or when either field
    is missing or not a string, and with 500 when synthesis raises.
    """
    data = request.get_json()
    # A JSON body may be null, a list, a number, ...; reject those up front so
    # the field checks below cannot raise TypeError (which surfaced as a 500).
    if not isinstance(data, dict):
        abort(400, "Request body must be a JSON object")
    if not isinstance(data.get("text"), str):
        abort(400, '"text" is required')
    if not isinstance(data.get("voice"), str):
        abort(400, '"voice" is required')

    voice = data["voice"]
    # Strip asterisks before synthesis (presumably chat emphasis markers --
    # confirm with the caller).
    text = data["text"].replace("*", "")

    # Keep the try body limited to the call that talks to the TTS backend.
    try:
        audio = edge.generate_audio(text=text, voice=voice)
    except Exception as e:
        print(e)
        abort(500, voice)
    return Response(audio, mimetype="audio/mpeg")
|
||||
|
||||
|
||||
@app.route("/api/chromadb", methods=["POST"])
|
||||
@require_module("chromadb")
|
||||
def chromadb_add_messages():
|
||||
|
||||
32
tts_edge.py
Normal file
32
tts_edge.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import io
|
||||
import edge_tts
|
||||
import asyncio
|
||||
|
||||
|
||||
def get_voices():
    """Run ``edge_tts.list_voices()`` to completion and return its result."""
    return asyncio.run(edge_tts.list_voices())
|
||||
|
||||
|
||||
async def _iterate_chunks(audio):
    """Yield the ``"data"`` payload of every ``"audio"`` chunk from ``audio.stream()``.

    Chunks whose ``"type"`` is anything else are skipped.
    """
    async for part in audio.stream():
        if part["type"] != "audio":
            continue
        yield part["data"]
|
||||
|
||||
|
||||
async def _async_generator_to_list(async_gen):
    """Drain an async iterable and return its items as a list, in order."""
    return [item async for item in async_gen]
|
||||
|
||||
|
||||
def generate_audio(text: str, voice: str) -> bytes:
    """Synthesize ``text`` with the Edge TTS ``voice`` and return the audio bytes.

    The audio chunks are gathered inside an event loop started by
    ``asyncio.run`` and concatenated in the order they arrived.
    """
    communicate = edge_tts.Communicate(text, voice)
    audio_chunks = asyncio.run(
        _async_generator_to_list(_iterate_chunks(communicate))
    )
    return b"".join(audio_chunks)
|
||||
Reference in New Issue
Block a user