diff --git a/.gitignore b/.gitignore
index e385523..61324d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,4 @@ api_key.txt
 stt_test.wav
 talkinghead/tha3/models
 docker/cache
+launch.bat
diff --git a/README.md b/README.md
index 3767cea..c1dd9b4 100644
--- a/README.md
+++ b/README.md
@@ -230,6 +230,19 @@ cd SillyTavern-extras
 | `--sd-remote-ssl` | Use SSL for the remote SD backend<br>Default: **False** |
 | `--sd-remote-auth` | Specify the `username:password` for the remote SD backend (if required) |
 
+## Using faster-whisper instead of whisper (extras)
+1. Install CUDA 12 and cuDNN 8.
+2. Install the faster-whisper requirements:
+```
+pip install -r requirements-faster-whisper.txt
+```
+3. Use the `--use-faster-whisper` argument to switch to faster-whisper.
+
+Optional:
+
+1. Use the `--faster-whisper-type` argument to change the compute type for faster-whisper (e.g. `--faster-whisper-type=int8`).
+2. Use the `--faster-whisper-cpu` argument to switch the computing device to CPU.
+
 ## Coqui TTS
 
 ### Running on Mac M1
diff --git a/modules/speech_recognition/faster_whisper_module.py b/modules/speech_recognition/faster_whisper_module.py
new file mode 100644
index 0000000..c571257
--- /dev/null
+++ b/modules/speech_recognition/faster_whisper_module.py
@@ -0,0 +1,72 @@
+"""
+Speech-to-text module based on faster-whisper for SillyTavern Extras
+ - faster-whisper github: https://github.com/SYSTRAN/faster-whisper
+ - Whisper github: https://github.com/openai/whisper
+
+Authors:
+ - Tony Ribeiro (https://github.com/Tony-sama)
+
+Models are downloaded into the user cache folder, example: C:/Users/toto/.cache/huggingface/hub
+
+References:
+ - Code adapted from:
+   - whisper github: https://github.com/openai/whisper
+   - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
+"""
+from flask import jsonify, abort, request
+
+from faster_whisper import WhisperModel
+
+DEBUG_PREFIX = "<stt faster-whisper module>"
+RECORDING_FILE_PATH = "stt_test.wav"
+
+model_size = "large-v3-turbo"
+
+# Set by server.py after load_model(); kept at None so process_audio() can guard against it
+model = None
+
+
+def load_model(file_path=None, whisper_device="cuda", whisper_compute_type="auto"):
+    """
+    Load the given faster-whisper model from file_path, or default to the large-v3-turbo model.
+    Models are downloaded into the user cache folder, example: C:/Users/toto/.cache/huggingface/hub
+    """
+    # Pick a sensible default compute type for the target device
+    if whisper_compute_type == "auto":
+        whisper_compute_type = (
+            "int8"
+            if whisper_device == "cpu"
+            else "float16")
+
+    if file_path is None:
+        print(f"faster-whisper using {model_size} model with {whisper_compute_type}")
+        return WhisperModel(model_size, device=whisper_device, compute_type=whisper_compute_type)
+    else:
+        print(f"faster-whisper using {file_path} model with {whisper_compute_type}")
+        return WhisperModel(file_path, device=whisper_device, compute_type=whisper_compute_type)
+
+
+def process_audio():
+    """
+    Transcribe the request audio file to text using faster-whisper
+    """
+
+    if model is None:
+        print(DEBUG_PREFIX, "Whisper model not initialized yet.")
+        return ""
+
+    try:
+        file = request.files.get('AudioFile')
+        language = request.form.get('language', default=None)
+        file.save(RECORDING_FILE_PATH)
+        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5, language=language)
+        transcript = ""
+        for segment in segments:
+            transcript += " " + segment.text
+        print(DEBUG_PREFIX, "Transcribed from audio file (faster-whisper):", transcript)
+
+        return jsonify({"transcript": transcript})
+
+    except Exception as e:  # No exception observed during testing, but just in case
+        print(e)
+        abort(500, DEBUG_PREFIX + " Exception occurred while processing audio")
diff --git a/requirements-faster-whisper.txt b/requirements-faster-whisper.txt
new file mode 100644
index 0000000..76a9c65
--- /dev/null
+++ b/requirements-faster-whisper.txt
@@ -0,0 +1,7 @@
+ctranslate2==4.4.0
+huggingface_hub>=0.13
+tokenizers>=0.13,<1
+onnxruntime>=1.14,<2
+av>=11
+tqdm
+faster-whisper
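A quick way to sanity-check the new module outside the server is to call `load_model()` and transcribe the test recording directly. This is a minimal sketch, not part of the patch: the cpu/int8 settings are only there to avoid needing CUDA 12 + cuDNN 8, and `stt_test.wav` (the module's own `RECORDING_FILE_PATH`) is assumed to exist. Note that the default `large-v3-turbo` model is downloaded on first run.

```python
# Minimal standalone check of faster_whisper_module (sketch, not part of the patch).
# Assumes stt_test.wav exists; cpu/int8 avoids needing a CUDA setup.
import modules.speech_recognition.faster_whisper_module as fw

fw.model = fw.load_model(whisper_device="cpu", whisper_compute_type="int8")
segments, info = fw.model.transcribe("stt_test.wav", beam_size=5)
print("".join(segment.text for segment in segments))
```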
action="store_true", help="Save the last rvc input/output audio file into data/tmp/ folder (for research)") parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-text model") -parser.add_argument("--stt-whisper-model-path", help="Load a custom vosk speech-to-text model") +parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model") + +parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper") +parser.add_argument("--faster-whisper-cpu", action="store_true", help="Use cpu to run faster-whisper, saves VRAM but much slower") +parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16 for cuda and int8 for cpu") + # sd_group = parser.add_mutually_exclusive_group() local_sd = parser.add_argument_group("sd-local") @@ -1161,15 +1166,36 @@ if "vosk-stt" in modules: app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"]) if "whisper-stt" in modules: - print("Initializing Whisper speech-recognition (from ST request file)") + whisper_fast=( + True + if args.use_faster_whisper + else False) + whisper_model_path = ( args.stt_whisper_model_path if args.stt_whisper_model_path else None) - import modules.speech_recognition.whisper_module as whisper_module + if whisper_fast: + + faster_whisper_device=( + "cpu" + if args.faster_whisper_cpu + else "cuda") + + faster_whisper_type=( + args.faster_whisper_type + if args.faster_whisper_type + else "auto") + + print(f"Initializing Faster-Whisper speech-recognition (from ST request file) on {faster_whisper_device}") + import modules.speech_recognition.faster_whisper_module as whisper_module + whisper_module.model = whisper_module.load_model(file_path=whisper_model_path,whisper_device=faster_whisper_device,whisper_compute_type=faster_whisper_type) + else: + print("Initializing Whisper speech-recognition (from ST request file)") + import modules.speech_recognition.whisper_module as whisper_module + whisper_module.model = whisper_module.load_model(file_path=whisper_model_path) - whisper_module.model = whisper_module.load_model(file_path=whisper_model_path) app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"]) if "streaming-stt" in modules: