Merge speech recognition module with neo branch.

This commit is contained in:
Tony Ribeiro
2023-07-27 20:29:17 +02:00
parent 9b1e3d1250
commit d528c5e5ad
4 changed files with 162 additions and 0 deletions

View File

@@ -0,0 +1,77 @@
"""
Speech-to-text module based on Vosk for SillyTavern Extras
- Vosk website: https://alphacephei.com/vosk/
- Vosk api: https://github.com/alphacep/vosk-api
Authors:
- Tony Ribeiro (https://github.com/Tony-sama)
Models are saved into user cache folder, example: C:/Users/toto/.cache/vosk
References:
- Code adapted from: https://github.com/alphacep/vosk-api/blob/master/python/example/test_simple.py
"""
import json
import wave

from flask import jsonify, abort, request
from vosk import Model, KaldiRecognizer, SetLogLevel
import soundfile
DEBUG_PREFIX = "<stt vosk module>"
RECORDING_FILE_PATH = "stt_test.wav"
model = None
SetLogLevel(-1)
def load_model(file_path=None):
    """
    Load a Vosk model.

    :param file_path: optional path to a custom model directory; when None,
        the default "en-us" model is used (downloaded to the user cache
        folder, e.g. C:/Users/toto/.cache/vosk).
    :return: a vosk.Model instance.
    """
    return Model(lang="en-us") if file_path is None else Model(file_path)
def process_audio():
    """
    Transcribe the request's audio file to text using the loaded Vosk model.

    Expects a multipart form upload under the key 'AudioFile'. The upload is
    saved to RECORDING_FILE_PATH, rewritten with soundfile (to normalize the
    WAV container into something the wave module accepts), then streamed into
    a KaldiRecognizer.

    :return: flask JSON response {"transcript": "..."} on success, or an
        empty string when the model is not loaded yet.
    :raises: aborts with HTTP 500 on invalid audio format or any other error.
    """
    if model is None:
        print(DEBUG_PREFIX, "Vosk model not initialized yet.")
        return ""

    try:
        file = request.files.get('AudioFile')
        file.save(RECORDING_FILE_PATH)

        # Rewrite the upload with soundfile so wave.open can parse it.
        data, samplerate = soundfile.read(RECORDING_FILE_PATH)
        soundfile.write(RECORDING_FILE_PATH, data, samplerate)

        # Context manager guarantees the wave file is closed (the original
        # leaked the handle).
        with wave.open(RECORDING_FILE_PATH, "rb") as wf:
            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
                print("Audio file must be WAV format mono PCM.")
                # NOTE(review): this abort is raised inside the try and is
                # re-caught by the generic handler below, which replaces the
                # specific message — consider re-raising HTTPException.
                abort(500, DEBUG_PREFIX+" Audio file must be WAV format mono PCM.")

            rec = KaldiRecognizer(model, wf.getframerate())
            #rec.SetWords(True)
            #rec.SetPartialWords(True)

            # Feed 4000-frame chunks until EOF or the recognizer accepts a
            # full utterance.
            # NOTE(review): stopping at the FIRST accepted utterance means
            # only the first sentence of a longer recording is transcribed —
            # confirm this is intended for short push-to-talk clips.
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    break

            # Parse the recognizer's JSON result instead of slicing the raw
            # string ([14:-3] silently breaks if Vosk changes its JSON
            # formatting).
            transcript = json.loads(rec.Result()).get("text", "")

        print(DEBUG_PREFIX, "Transcripted from request audio file:", transcript)
        return jsonify({"transcript": transcript})
    except Exception as e: # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

View File

@@ -0,0 +1,57 @@
"""
Speech-to-text module based on Whisper for SillyTavern Extras
- Whisper github: https://github.com/openai/whisper
Authors:
- Tony Ribeiro (https://github.com/Tony-sama)
Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
References:
- Code adapted from:
- whisper github: https://github.com/openai/whisper
- oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
"""
from random import sample
from flask import jsonify, abort, request
import whisper
DEBUG_PREFIX = "<stt whisper module>"
RECORDING_FILE_PATH = "stt_test.wav"
model = None
def load_model(file_path=None):
    """
    Load a Whisper model.

    (Fixed: the original docstring was copy-pasted from the Vosk module and
    described the wrong backend.)

    :param file_path: optional path or name of a custom Whisper model; when
        None, the default "base.en" model is used (downloaded to the user
        cache folder, e.g. C:/Users/toto/.cache/whisper).
    :return: a whisper model instance.
    """
    if file_path is None:
        return whisper.load_model("base.en")
    else:
        return whisper.load_model(file_path)
def process_audio():
    """
    Transcribe the request's uploaded audio file to text with Whisper.

    Expects a multipart form upload under the key 'AudioFile', saved to
    RECORDING_FILE_PATH before transcription.

    :return: flask JSON response {"transcript": "..."} on success, or an
        empty string when the model is not loaded yet.
    :raises: aborts with HTTP 500 on any processing error.
    """
    if model is None:
        print(DEBUG_PREFIX, "Whisper model not initialized yet.")
        return ""

    try:
        audio_upload = request.files.get('AudioFile')
        audio_upload.save(RECORDING_FILE_PATH)

        transcript = model.transcribe(RECORDING_FILE_PATH)["text"]
        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
        return jsonify({"transcript": transcript})
    except Exception as e: # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

View File

@@ -18,3 +18,6 @@ chromadb
sentence_transformers
edge-tts
TTS
vosk
sounddevice
openai-whisper

View File

@@ -84,6 +84,8 @@ parser.add_argument(
)
parser.add_argument("--coqui-gpu", action="store_false", help="Run the voice models on the GPU (CPU is default)")
parser.add_argument("--coqui-model", help="Load a custom Coqui TTS model")
parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-text model")
# Fixed copy-paste bug: help text said "vosk" for the whisper option.
parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model")
sd_group = parser.add_mutually_exclusive_group()
local_sd = sd_group.add_argument_group("sd-local")
@@ -309,6 +311,29 @@ CORS(app) # allow cross-domain requests
Compress(app) # compress responses
app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024
if "vosk-stt" in modules:
    print("Initializing Vosk STT streaming")
    # Use the CLI-supplied model path when present; None selects the default
    # en-us model inside the module.
    vosk_model_path = args.stt_vosk_model_path if args.stt_vosk_model_path else None
    import modules.speech_recognition.vosk_module as vosk_module

    vosk_module.model = vosk_module.load_model(file_path=vosk_model_path)
    app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"])
if "whisper-stt" in modules:
    print("Initializing Whisper STT streaming")
    # Use the CLI-supplied model path when present; None selects the default
    # "base.en" model inside the module.
    whisper_model_path = args.stt_whisper_model_path if args.stt_whisper_model_path else None
    import modules.speech_recognition.whisper_module as whisper_module

    whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
    app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"])
def require_module(name):
def wrapper(fn):