[Feature] Edge Text to speech #34

This commit is contained in:
SillyLossy
2023-06-04 12:38:36 +03:00
parent a0000347e9
commit a707dd02ff
5 changed files with 98 additions and 6 deletions

View File

@@ -121,7 +121,8 @@ cd SillyTavern-extras
| `summarize` | Text summarization | ✔️ Yes |
| `classify` | Text sentiment classification | ✔️ Yes |
| `sd` | Stable Diffusion image generation | :x: No (✔️ remote) |
| `tts` | [Silero TTS server](https://github.com/ouoertheo/silero-api-server) | :x: No |
| `silero-tts` | [Silero TTS server](https://github.com/ouoertheo/silero-api-server) | :x: No |
| `edge-tts` | [Microsoft Edge TTS client](https://github.com/rany2/edge-tts) | ✔️ Yes |
| `chromadb` | Infinity context server | :x: No |
@@ -308,7 +309,7 @@ None
{ "previous_model": "name of the previous model", "current_model": "name of the newly loaded model" }
```
### Generate TTS voice
### Generate Silero TTS voice
`POST /api/tts/generate`
#### **Input**
```
@@ -317,7 +318,7 @@ None
#### **Output**
WAV audio file.
### Get TTS voices
### Get Silero TTS voices
`GET /api/tts/speakers`
#### **Output**
```
@@ -330,7 +331,7 @@ WAV audio file.
]
```
### Get TTS voice sample
### Get Silero TTS voice sample
`GET /api/tts/sample/<voice_id>`
#### **Output**
WAV audio file.
@@ -400,3 +401,19 @@ WAV audio file.
```
{ "chat_id": "chat1 - 2023-04-12" }
```
### Get a list of Edge TTS voices
`GET /api/edge-tts/list`
#### **Output**
```
[{"Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, AdriNeural)", "ShortName": "af-ZA-AdriNeural", "Gender": "Female", "Locale": "af-ZA", "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", "FriendlyName": "Microsoft Adri Online (Natural) - Afrikaans (South Africa)", "Status": "GA", "VoiceTag": {"ContentCategories": ["General"], "VoicePersonalities": ["Friendly", "Positive"]}}]
```
### Generate Edge TTS voice
`POST /api/edge-tts/generate`
#### **Input**
```
{ "text": "Text to narrate", "voice": "af-ZA-AdriNeural" }
```
#### **Output**
MP3 audio file.

View File

@@ -14,4 +14,5 @@ transformers==4.28.1
diffusers==0.16.1
silero-api-server
chromadb
sentence_transformers
sentence_transformers
edge-tts

View File

@@ -7,4 +7,5 @@ colorama
--extra-index-url https://download.pytorch.org/whl/cu117
torch==2.0.0+cu117
transformers==4.28.1
webuiapi
webuiapi
edge-tts

View File

@@ -3,6 +3,7 @@ from flask import (
Flask,
jsonify,
request,
Response,
render_template_string,
abort,
send_from_directory,
@@ -207,6 +208,12 @@ elif "sd" in modules and sd_use_remote:
modules.remove("sd")
# Backward compatibility: the legacy "tts" module name is an alias for
# "silero-tts". Rewrite it in place so the checks below only ever see the
# new name.
if "tts" in modules:
    print("tts module is deprecated. Please use silero-tts instead.")
    # list.remove() drops only the first match, so loop in case the legacy
    # name was passed more than once.
    while "tts" in modules:
        modules.remove("tts")
    # Avoid a duplicate entry when both the old and new names were requested.
    if "silero-tts" not in modules:
        modules.append("silero-tts")
if "silero-tts" in modules:
if not os.path.exists(SILERO_SAMPLES_PATH):
os.makedirs(SILERO_SAMPLES_PATH)
print("Initializing Silero TTS server")
@@ -218,6 +225,12 @@ if "tts" in modules:
tts_service.update_sample_text(SILERO_SAMPLE_TEXT)
tts_service.generate_samples()
# The tts_edge helper module is imported only when the "edge-tts" module is
# enabled; the Edge TTS routes reach it through the `edge` alias bound here.
if "edge-tts" in modules:
print("Initializing Edge TTS client")
import tts_edge as edge
if "chromadb" in modules:
print("Initializing ChromaDB")
import chromadb
@@ -609,6 +622,7 @@ def get_modules():
@app.route("/api/tts/speakers", methods=["GET"])
@require_module("silero-tts")
def tts_speakers():
voices = [
{
@@ -622,6 +636,7 @@ def tts_speakers():
@app.route("/api/tts/generate", methods=["POST"])
@require_module("silero-tts")
def tts_generate():
voice = request.get_json()
if "text" not in voice or not isinstance(voice["text"], str):
@@ -639,10 +654,36 @@ def tts_generate():
@app.route("/api/tts/sample/<speaker>", methods=["GET"])
@require_module("silero-tts")
def tts_play_sample(speaker: str):
    """Serve the file ``<speaker>.wav`` from the Silero samples directory."""
    sample_filename = f"{speaker}.wav"
    return send_from_directory(SILERO_SAMPLES_PATH, sample_filename)
@app.route("/api/edge-tts/list", methods=["GET"])
@require_module("edge-tts")
def edge_tts_list():
    """Return the Edge TTS voice list as a JSON response."""
    return jsonify(edge.get_voices())
@app.route("/api/edge-tts/generate", methods=["POST"])
@require_module("edge-tts")
def edge_tts_generate():
    """Synthesize speech with Edge TTS and return it as an MP3 response.

    Expects a JSON object body with string fields ``"text"`` and ``"voice"``.

    Aborts with 400 when the body is not a JSON object or when either field
    is missing or not a string, and with 500 (using the requested voice as
    the description) when audio generation raises.
    """
    data = request.get_json()
    # A body of `null`, a list or a bare string would otherwise raise a
    # TypeError (or substring-match) in the membership checks below.
    if not isinstance(data, dict):
        abort(400, "Request body must be a JSON object")
    if "text" not in data or not isinstance(data["text"], str):
        abort(400, '"text" is required')
    if "voice" not in data or not isinstance(data["voice"], str):
        abort(400, '"voice" is required')

    # Remove asterisks before handing the text to the synthesizer.
    text = data["text"].replace("*", "")
    voice = data["voice"]

    # Keep the try body limited to the call that is expected to fail.
    try:
        audio = edge.generate_audio(text=text, voice=voice)
    except Exception as e:
        print(e)
        abort(500, voice)
    return Response(audio, mimetype="audio/mpeg")
@app.route("/api/chromadb", methods=["POST"])
@require_module("chromadb")
def chromadb_add_messages():

32
tts_edge.py Normal file
View File

@@ -0,0 +1,32 @@
import io
import edge_tts
import asyncio
def get_voices():
    """Return the result of ``edge_tts.list_voices()``, awaited synchronously."""
    # list_voices() is a coroutine; asyncio.run drives it to completion so
    # synchronous callers get the plain result.
    return asyncio.run(edge_tts.list_voices())
async def _iterate_chunks(audio):
    """Yield the ``"data"`` payload of every ``"audio"`` event in the stream.

    Events from ``audio.stream()`` whose ``"type"`` is anything other than
    ``"audio"`` are skipped.
    """
    async for event in audio.stream():
        if event["type"] != "audio":
            continue
        yield event["data"]
async def _async_generator_to_list(async_gen):
    """Exhaust an async iterable and return its items as a list, in order."""
    return [item async for item in async_gen]
def generate_audio(text: str, voice: str) -> bytes:
    """Synthesize *text* with the given Edge TTS *voice* and return the audio bytes.

    The audio chunks streamed by ``edge_tts.Communicate`` are collected
    synchronously and concatenated in arrival order.
    """
    communicate = edge_tts.Communicate(text, voice)
    audio_chunks = asyncio.run(
        _async_generator_to_list(_iterate_chunks(communicate))
    )
    return b"".join(audio_chunks)