Merge speech recognition module with neo branch.

This commit is contained in:
Tony Ribeiro
2023-07-27 20:29:17 +02:00
parent 9b1e3d1250
commit d528c5e5ad
4 changed files with 162 additions and 0 deletions

View File

@@ -0,0 +1,77 @@
"""
Speech-to-text module based on Vosk for SillyTavern Extras
- Vosk website: https://alphacephei.com/vosk/
- Vosk api: https://github.com/alphacep/vosk-api
Authors:
- Tony Ribeiro (https://github.com/Tony-sama)
Models are saved into user cache folder, example: C:/Users/toto/.cache/vosk
References:
- Code adapted from: https://github.com/alphacep/vosk-api/blob/master/python/example/test_simple.py
"""
import json
import wave

from flask import jsonify, abort, request
from vosk import Model, KaldiRecognizer, SetLogLevel
import soundfile
DEBUG_PREFIX = "<stt vosk module>"
RECORDING_FILE_PATH = "stt_test.wav"
model = None
SetLogLevel(-1)
def load_model(file_path=None):
    """
    Load a Vosk model.

    :param file_path: optional path to a custom model directory; when None,
        the default "en-us" model is used (downloaded to the user cache
        folder, e.g. C:/Users/toto/.cache/vosk).
    :return: a vosk.Model instance.
    """
    return Model(lang="en-us") if file_path is None else Model(file_path)
def process_audio():
    """
    Transcribe the request's audio file to text using the loaded Vosk model.

    Expects a multipart form upload under the key 'AudioFile'. The upload is
    saved to RECORDING_FILE_PATH, rewritten with soundfile (to normalize the
    WAV container into something the wave module accepts), then streamed into
    a KaldiRecognizer.

    :return: flask JSON response {"transcript": "..."} on success, or an
        empty string when the model is not loaded yet.
    :raises: aborts with HTTP 500 on invalid audio format or any other error.
    """
    if model is None:
        print(DEBUG_PREFIX, "Vosk model not initialized yet.")
        return ""

    try:
        file = request.files.get('AudioFile')
        file.save(RECORDING_FILE_PATH)

        # Rewrite the upload with soundfile so wave.open can parse it.
        data, samplerate = soundfile.read(RECORDING_FILE_PATH)
        soundfile.write(RECORDING_FILE_PATH, data, samplerate)

        # Context manager guarantees the wave file is closed (the original
        # leaked the handle).
        with wave.open(RECORDING_FILE_PATH, "rb") as wf:
            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
                print("Audio file must be WAV format mono PCM.")
                # NOTE(review): this abort is raised inside the try and is
                # re-caught by the generic handler below, which replaces the
                # specific message — consider re-raising HTTPException.
                abort(500, DEBUG_PREFIX+" Audio file must be WAV format mono PCM.")

            rec = KaldiRecognizer(model, wf.getframerate())
            #rec.SetWords(True)
            #rec.SetPartialWords(True)

            # Feed 4000-frame chunks until EOF or the recognizer accepts a
            # full utterance.
            # NOTE(review): stopping at the FIRST accepted utterance means
            # only the first sentence of a longer recording is transcribed —
            # confirm this is intended for short push-to-talk clips.
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    break

            # Parse the recognizer's JSON result instead of slicing the raw
            # string ([14:-3] silently breaks if Vosk changes its JSON
            # formatting).
            transcript = json.loads(rec.Result()).get("text", "")

        print(DEBUG_PREFIX, "Transcripted from request audio file:", transcript)
        return jsonify({"transcript": transcript})
    except Exception as e: # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

View File

@@ -0,0 +1,57 @@
"""
Speech-to-text module based on Whisper for SillyTavern Extras
- Whisper github: https://github.com/openai/whisper
Authors:
- Tony Ribeiro (https://github.com/Tony-sama)
Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
References:
- Code adapted from:
- whisper github: https://github.com/openai/whisper
- oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
"""
from random import sample
from flask import jsonify, abort, request
import whisper
DEBUG_PREFIX = "<stt whisper module>"
RECORDING_FILE_PATH = "stt_test.wav"
model = None
def load_model(file_path=None):
    """
    Load a Whisper model.

    (Fixed: the original docstring was copy-pasted from the Vosk module and
    described the wrong backend.)

    :param file_path: optional path or name of a custom Whisper model; when
        None, the default "base.en" model is used (downloaded to the user
        cache folder, e.g. C:/Users/toto/.cache/whisper).
    :return: a whisper model instance.
    """
    if file_path is None:
        return whisper.load_model("base.en")
    else:
        return whisper.load_model(file_path)
def process_audio():
    """
    Transcribe the request's uploaded audio file to text with Whisper.

    Expects a multipart form upload under the key 'AudioFile', saved to
    RECORDING_FILE_PATH before transcription.

    :return: flask JSON response {"transcript": "..."} on success, or an
        empty string when the model is not loaded yet.
    :raises: aborts with HTTP 500 on any processing error.
    """
    if model is None:
        print(DEBUG_PREFIX, "Whisper model not initialized yet.")
        return ""

    try:
        audio_upload = request.files.get('AudioFile')
        audio_upload.save(RECORDING_FILE_PATH)

        transcript = model.transcribe(RECORDING_FILE_PATH)["text"]
        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
        return jsonify({"transcript": transcript})
    except Exception as e: # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

View File

@@ -18,3 +18,6 @@ chromadb
sentence_transformers
edge-tts
TTS
vosk
sounddevice
openai-whisper

View File

@@ -84,6 +84,8 @@ parser.add_argument(
)
parser.add_argument("--coqui-gpu", action="store_false", help="Run the voice models on the GPU (CPU is default)")
parser.add_argument("--coqui-model", help="Load a custom Coqui TTS model")
parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-text model")
# Fixed copy-paste bug: help text said "vosk" for the whisper option.
parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model")
sd_group = parser.add_mutually_exclusive_group()
local_sd = sd_group.add_argument_group("sd-local")
@@ -309,6 +311,29 @@ CORS(app) # allow cross-domain requests
Compress(app) # compress responses
app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024
if "vosk-stt" in modules:
    print("Initializing Vosk STT streaming")
    # Use the CLI-supplied model path when present; None selects the default
    # en-us model inside the module.
    vosk_model_path = args.stt_vosk_model_path if args.stt_vosk_model_path else None
    import modules.speech_recognition.vosk_module as vosk_module

    vosk_module.model = vosk_module.load_model(file_path=vosk_model_path)
    app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"])
if "whisper-stt" in modules:
    print("Initializing Whisper STT streaming")
    # Use the CLI-supplied model path when present; None selects the default
    # "base.en" model inside the module.
    whisper_model_path = args.stt_whisper_model_path if args.stt_whisper_model_path else None
    import modules.speech_recognition.whisper_module as whisper_module

    whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
    app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"])
def require_module(name):
def wrapper(fn):