diff --git a/.gitignore b/.gitignore
index e385523..61324d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,4 @@ api_key.txt
stt_test.wav
talkinghead/tha3/models
docker/cache
+launch.bat
diff --git a/README.md b/README.md
index 3767cea..c1dd9b4 100644
--- a/README.md
+++ b/README.md
@@ -230,6 +230,19 @@ cd SillyTavern-extras
| `--sd-remote-ssl` | Use SSL for the remote SD backend<br>Default: **False** |
| `--sd-remote-auth` | Specify the `username:password` for the remote SD backend (if required) |
+
+## Using faster-whisper instead of whisper for speech-to-text
+1. Install CUDA 12 and cuDNN 8
+2. Install the faster-whisper requirements:
+```
+pip install -r requirements-faster-whisper.txt
+```
+3. Use the `--use-faster-whisper` argument to switch to faster-whisper
+
+Optional:
+
+1. Use the `--faster-whisper-type` argument to change the compute type for faster-whisper (e.g. `--faster-whisper-type=int8`)
+2. Use the `--faster-whisper-cpu` argument to run faster-whisper on the CPU instead of CUDA (see the example below)
+
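+For example, to start Extras with faster-whisper using int8 compute (a sketch; `--enable-modules=whisper-stt` here is illustrative, add your other modules as needed):
+```
+python server.py --enable-modules=whisper-stt --use-faster-whisper --faster-whisper-type=int8
+```
+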
## Coqui TTS
### Running on Mac M1
diff --git a/modules/speech_recognition/faster_whisper_module.py b/modules/speech_recognition/faster_whisper_module.py
new file mode 100644
index 0000000..c571257
--- /dev/null
+++ b/modules/speech_recognition/faster_whisper_module.py
@@ -0,0 +1,66 @@
+"""
+Speech-to-text module based on Whisper for SillyTavern Extras
+ - Whisper github: https://github.com/openai/whisper
+
+Authors:
+ - Tony Ribeiro (https://github.com/Tony-sama)
+
+Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
+
+References:
+ - Code adapted from:
+ - whisper github: https://github.com/openai/whisper
+ - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
+"""
+from flask import jsonify, abort, request
+
+from faster_whisper import WhisperModel
+
+DEBUG_PREFIX = ""
+RECORDING_FILE_PATH = "stt_test.wav"
+
+model_size = "large-v3-turbo"
+
+
+def load_model(file_path=None, whisper_device="cuda", whisper_compute_type="auto"):
+    """
+    Load the given faster-whisper model from file_path, or default to the large-v3-turbo model.
+    Models are downloaded to the Hugging Face cache folder, example: C:/Users/toto/.cache/huggingface
+    """
+    # Pick a default compute type per device: int8 on cpu, float16 on cuda
+    if whisper_compute_type == "auto":
+        whisper_compute_type = (
+            "int8"
+            if whisper_device == "cpu"
+            else "float16")
+
+    if file_path is None:
+        print(f"faster-whisper using {model_size} model with {whisper_compute_type}")
+        return WhisperModel(model_size, device=whisper_device, compute_type=whisper_compute_type)
+    else:
+        print(f"faster-whisper using {file_path} model with {whisper_compute_type}")
+        return WhisperModel(file_path, device=whisper_device, compute_type=whisper_compute_type)
+
+def process_audio():
+    """
+    Transcribe the request audio file to text using faster-whisper
+    """
+
+    if model is None:
+        print(DEBUG_PREFIX, "Whisper model not initialized yet.")
+        return ""
+
+    try:
+        file = request.files.get('AudioFile')
+        if file is None:
+            abort(400, DEBUG_PREFIX + " No AudioFile provided")
+        language = request.form.get('language', default=None)
+        file.save(RECORDING_FILE_PATH)
+        # transcribe() returns a lazy generator; segments are decoded during iteration
+        segments, info = model.transcribe(RECORDING_FILE_PATH, language=language, beam_size=5)
+        transcript = " ".join(segment.text.strip() for segment in segments)
+        print(DEBUG_PREFIX, "Transcribed from audio file (faster-whisper):", transcript)
+
+        return jsonify({"transcript": transcript})
+
+    except Exception as e:  # Security net, no exception observed during testing but we never know
+        print(e)
+        abort(500, DEBUG_PREFIX + " Exception occurred while processing audio")
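For reference, the endpoint this module serves (registered in server.py below) can be exercised with a short client sketch; this is a hypothetical test snippet, assuming Extras runs on its default port 5100:

```python
import requests

# POST a WAV file to the speech-recognition endpoint; the "AudioFile" field
# name and optional "language" form field match process_audio() above.
with open("stt_test.wav", "rb") as f:
    response = requests.post(
        "http://localhost:5100/api/speech-recognition/whisper/process-audio",
        files={"AudioFile": f},
        data={"language": "en"},
    )
print(response.json()["transcript"])
```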
diff --git a/requirements-faster-whisper.txt b/requirements-faster-whisper.txt
new file mode 100644
index 0000000..76a9c65
--- /dev/null
+++ b/requirements-faster-whisper.txt
@@ -0,0 +1,7 @@
+ctranslate2==4.4.0
+huggingface_hub>=0.13
+tokenizers>=0.13,<1
+onnxruntime>=1.14,<2
+av>=11
+tqdm
+faster-whisper
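As a quick sanity check that the pinned ctranslate2 build can see your GPU (a sketch; `get_cuda_device_count` comes from the ctranslate2 Python API):

```
python -c "import ctranslate2; print(ctranslate2.get_cuda_device_count())"
```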
diff --git a/server.py b/server.py
index 48b7d36..e821bd6 100644
--- a/server.py
+++ b/server.py
@@ -935,7 +935,12 @@ parser.add_argument("--max-content-length", help="Set the max")
parser.add_argument("--rvc-save-file", action="store_true", help="Save the last rvc input/output audio file into data/tmp/ folder (for research)")
parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-text model")
-parser.add_argument("--stt-whisper-model-path", help="Load a custom vosk speech-to-text model")
+parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model")
+
+parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper")
+parser.add_argument("--faster-whisper-cpu", action="store_true", help="Use cpu to run faster-whisper, saves VRAM but much slower")
+parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16 for cuda and int8 for cpu")
+
# sd_group = parser.add_mutually_exclusive_group()
local_sd = parser.add_argument_group("sd-local")
@@ -1161,15 +1166,36 @@ if "vosk-stt" in modules:
app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"])
if "whisper-stt" in modules:
- print("Initializing Whisper speech-recognition (from ST request file)")
+ whisper_fast=(
+ True
+ if args.use_faster_whisper
+ else False)
+
whisper_model_path = (
args.stt_whisper_model_path
if args.stt_whisper_model_path
else None)
- import modules.speech_recognition.whisper_module as whisper_module
+ if whisper_fast:
+
+ faster_whisper_device=(
+ "cpu"
+ if args.faster_whisper_cpu
+ else "cuda")
+
+ faster_whisper_type=(
+ args.faster_whisper_type
+ if args.faster_whisper_type
+ else "auto")
+
+ print(f"Initializing Faster-Whisper speech-recognition (from ST request file) on {faster_whisper_device}")
+ import modules.speech_recognition.faster_whisper_module as whisper_module
+ whisper_module.model = whisper_module.load_model(file_path=whisper_model_path,whisper_device=faster_whisper_device,whisper_compute_type=faster_whisper_type)
+ else:
+ print("Initializing Whisper speech-recognition (from ST request file)")
+ import modules.speech_recognition.whisper_module as whisper_module
+ whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
- whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"])
if "streaming-stt" in modules: