Added arguments for faster-whisper usage without breaking the original whisper implementation

This commit is contained in:
ArrangingFear56
2024-12-10 00:55:46 +08:00
parent 2ef37240b9
commit dbec27b18a
2 changed files with 90 additions and 4 deletions

View File

@@ -0,0 +1,60 @@
"""
Speech-to-text module based on Whisper for SillyTavern Extras
- Whisper github: https://github.com/openai/whisper
Authors:
- Tony Ribeiro (https://github.com/Tony-sama)
Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
References:
- Code adapted from:
- whisper github: https://github.com/openai/whisper
- oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
"""
from flask import jsonify, abort, request
from faster_whisper import WhisperModel
DEBUG_PREFIX = "<stt whisper module>"
RECORDING_FILE_PATH = "stt_test.wav"
model_size = "large-v3-turbo"
def load_model(file_path=None, whisper_device="cuda", whisper_compute_type='float16'):
    """
    Load a faster-whisper model and return the initialized WhisperModel.

    :param file_path: optional path (or model identifier) of a custom
        faster-whisper model; when None, falls back to the module-level
        default ``model_size`` ("large-v3-turbo").
    :param whisper_device: device to run inference on ("cuda" or "cpu").
    :param whisper_compute_type: compute type passed through to
        faster-whisper (e.g. "float16", "int8").
    :return: a ``faster_whisper.WhisperModel`` instance.

    NOTE(review): when given a model identifier, faster-whisper downloads
    the weights to the user's Hugging Face cache folder — confirm exact
    location for the docs above.
    """
    # Both branches only differed in the model source; collapse them.
    model_path = model_size if file_path is None else file_path
    print(f"faster-whisper using {model_path}")
    return WhisperModel(model_path, device=whisper_device, compute_type=whisper_compute_type)
def process_audio():
    """
    Transcribe the audio file attached to the current Flask request using
    the module-level faster-whisper ``model``.

    Expects a multipart form with an 'AudioFile' file field and an optional
    'language' field (language hint for transcription).

    :return: JSON response ``{"transcript": "..."}`` on success, an empty
        string when the model is not yet initialized.
    :raises: aborts the request with HTTP 500 on any processing error.
    """
    if model is None:
        print(DEBUG_PREFIX, "Whisper model not initialized yet.")
        return ""

    try:
        file = request.files.get('AudioFile')
        language = request.form.get('language', default=None)
        file.save(RECORDING_FILE_PATH)
        # BUG FIX: the 'language' form field was previously read but never
        # forwarded, so the caller's language hint was silently ignored.
        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5, language=language)
        # Segments are yielded lazily; join them into one transcript string.
        transcript = "".join(" " + segment.text for segment in segments)
        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
        return jsonify({"transcript": transcript})
    except Exception as e:  # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX + " Exception occurs while processing audio")

View File

@@ -935,7 +935,12 @@ parser.add_argument("--max-content-length", help="Set the max")
# Speech-to-text related command-line options.
parser.add_argument("--rvc-save-file", action="store_true", help="Save the last rvc input/output audio file into data/tmp/ folder (for research)")
parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-text model")
# NOTE: this option must be declared exactly once — a duplicate
# add_argument for the same flag raises argparse.ArgumentError at startup.
parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model")
parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper")
parser.add_argument("--faster-whisper-device", help="Choose between cpu and cuda to run faster-whisper, defaults to cuda")
parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16")

# sd_group = parser.add_mutually_exclusive_group()
local_sd = parser.add_argument_group("sd-local")
@@ -1161,15 +1166,36 @@ if "vosk-stt" in modules:
app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"])
if "whisper-stt" in modules:
print("Initializing Whisper speech-recognition (from ST request file)")
whisper_fast=(
True
if args.use_faster_whisper
else False)
whisper_model_path = (
args.stt_whisper_model_path
if args.stt_whisper_model_path
else None)
import modules.speech_recognition.whisper_module as whisper_module
if whisper_fast:
faster_whisper_device=(
args.faster_whisper_device
if args.faster_whisper_device
else "cuda")
faster_whisper_type=(
args.faster_whisper_type
if args.faster_whisper_type
else "float16")
print(f"Initializing Faster-Whisper speech-recognition (from ST request file) on {faster_whisper_device}")
import modules.speech_recognition.faster_whisper_module as whisper_module
whisper_module.model = whisper_module.load_model(file_path=whisper_model_path,whisper_device=faster_whisper_device,whisper_compute_type=faster_whisper_type)
else:
print("Initializing Whisper speech-recognition (from ST request file)")
import modules.speech_recognition.whisper_module as whisper_module
whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"])
if "streaming-stt" in modules: