Mirror of https://github.com/SillyTavern/SillyTavern-Extras.git (synced 2026-01-26 17:20:04 +00:00)

Merge pull request #247 from ArrangingFear56/main

Implementation for faster-whisper
.gitignore (vendored)

@@ -141,3 +141,4 @@ api_key.txt
 stt_test.wav
 talkinghead/tha3/models
 docker/cache
+launch.bat
README.md

@@ -230,6 +230,19 @@ cd SillyTavern-extras
 | `--sd-remote-ssl` | Use SSL for the remote SD backend<br>Default: **False** |
 | `--sd-remote-auth` | Specify the `username:password` for the remote SD backend (if required) |
 
+## Using faster-whisper instead of whisper with the whisper (Extras) STT provider
+
+1. Install CUDA 12 and cuDNN 8.
+2. Install the faster-whisper requirements:
+```
+pip install -r requirements-faster-whisper.txt
+```
+
+Use the `--use-faster-whisper` argument to switch to faster-whisper.
+
+Optional:
+
+1. Use the `--faster-whisper-type` argument to change the compute type for faster-whisper (e.g. `--faster-whisper-type=int8`).
+2. Use the `--faster-whisper-cpu` argument to run faster-whisper on the CPU instead of the GPU.
+
 ## Coqui TTS
 
 ### Running on Mac M1
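
A note on the GPU/CPU choice in the README section added above: `--faster-whisper-cpu` is the fallback when no working CUDA 12 / cuDNN 8 setup is available. Below is a minimal sketch, not part of this PR, of how one might probe for a usable GPU before picking the device and compute type; it only assumes the `ctranslate2` package pulled in by `requirements-faster-whisper.txt`.

```
# Illustrative helper, not part of the PR: pick faster-whisper settings based on
# whether CTranslate2 (the engine behind faster-whisper) can see a CUDA device.
import ctranslate2

def pick_faster_whisper_settings():
    if ctranslate2.get_cuda_device_count() > 0:
        return "cuda", "float16"  # mirrors the module's GPU defaults
    return "cpu", "int8"          # mirrors the module's CPU defaults

if __name__ == "__main__":
    device, compute_type = pick_faster_whisper_settings()
    print(f"device={device}, compute_type={compute_type}")
```

If this reports a CPU-only setup, launching with `--faster-whisper-cpu` (and optionally `--faster-whisper-type=int8`) matches what the module would pick on its own.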
modules/speech_recognition/faster_whisper_module.py (new file)

@@ -0,0 +1,66 @@
"""
Speech-to-text module based on faster-whisper for SillyTavern Extras
    - faster-whisper github: https://github.com/SYSTRAN/faster-whisper
    - Whisper github: https://github.com/openai/whisper

Authors:
    - Tony Ribeiro (https://github.com/Tony-sama)

Models are downloaded into the Hugging Face cache folder, for example: C:/Users/toto/.cache/huggingface/hub

References:
    - Code adapted from:
        - whisper github: https://github.com/openai/whisper
        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
"""
from flask import jsonify, abort, request

from faster_whisper import WhisperModel

DEBUG_PREFIX = "<stt faster-whisper module>"
RECORDING_FILE_PATH = "stt_test.wav"

model = None  # assigned by server.py after calling load_model()
model_size = "large-v3-turbo"


def load_model(file_path=None, whisper_device="cuda", whisper_compute_type="auto"):
    """
    Load the given faster-whisper model from file_path, or default to the large-v3-turbo model.
    Models are downloaded into the Hugging Face cache folder, for example: C:/Users/toto/.cache/huggingface/hub
    """
    if whisper_compute_type == "auto":
        # Default compute type: int8 on CPU, float16 on GPU
        whisper_compute_type = "int8" if whisper_device == "cpu" else "float16"

    if file_path is None:
        print(f"faster-whisper using {model_size} model with {whisper_compute_type}")
        return WhisperModel(model_size, device=whisper_device, compute_type=whisper_compute_type)
    else:
        print(f"faster-whisper using {file_path} model with {whisper_compute_type}")
        return WhisperModel(file_path, device=whisper_device, compute_type=whisper_compute_type)


def process_audio():
    """
    Transcribe the audio file from the request to text using faster-whisper.
    """
    if model is None:
        print(DEBUG_PREFIX, "Whisper model not initialized yet.")
        return ""

    try:
        file = request.files.get('AudioFile')
        language = request.form.get('language', default=None)
        file.save(RECORDING_FILE_PATH)

        # Pass the requested language through; an empty value means auto-detect
        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5, language=language if language else None)
        transcript = ""
        for segment in segments:
            transcript += " " + segment.text
        print(DEBUG_PREFIX, "Transcribed from audio file (faster-whisper):", transcript)

        return jsonify({"transcript": transcript})

    except Exception as e:  # No exception observed during testing, but just in case
        print(e)
        abort(500, DEBUG_PREFIX + " Exception occurred while processing audio")
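
For reference, the same faster-whisper calls the module makes can be exercised outside of Flask. A minimal standalone sketch, not part of the PR, assuming the packages from `requirements-faster-whisper.txt` are installed and a local `test.wav` exists:

```
# Standalone sketch of the faster-whisper API used by the module above.
from faster_whisper import WhisperModel

# Same defaults the module picks on GPU: large-v3-turbo with float16.
model = WhisperModel("large-v3-turbo", device="cuda", compute_type="float16")

# transcribe() returns a lazy generator of segments plus metadata;
# the audio is actually decoded while the generator is consumed.
segments, info = model.transcribe("test.wav", beam_size=5)
print("detected language:", info.language, info.language_probability)

transcript = " ".join(segment.text.strip() for segment in segments)
print(transcript)
```

The lazy segments generator is why `process_audio()` above builds the transcript inside the for loop rather than getting a single string back from `transcribe()`.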
requirements-faster-whisper.txt (new file)

@@ -0,0 +1,7 @@
ctranslate2==4.4.0
huggingface_hub>=0.13
tokenizers>=0.13,<1
onnxruntime>=1.14,<2
av>=11
tqdm
faster-whisper
server.py

@@ -935,7 +935,12 @@ parser.add_argument("--max-content-length", help="Set the max")
 parser.add_argument("--rvc-save-file", action="store_true", help="Save the last rvc input/output audio file into data/tmp/ folder (for research)")
 
 parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-text model")
-parser.add_argument("--stt-whisper-model-path", help="Load a custom vosk speech-to-text model")
+parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model")
+
+parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper")
+parser.add_argument("--faster-whisper-cpu", action="store_true", help="Use cpu to run faster-whisper, saves VRAM but much slower")
+parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16 for cuda and int8 for cpu")
 
 # sd_group = parser.add_mutually_exclusive_group()
 
 local_sd = parser.add_argument_group("sd-local")
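
For reference, a toy parser with just the three new flags (a stand-alone illustration, not the server's full argument list) shows what they parse to:

```
# Illustrative only: a stripped-down parser with the three new flags.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--use-faster-whisper", action="store_true")
parser.add_argument("--faster-whisper-cpu", action="store_true")
parser.add_argument("--faster-whisper-type")

args = parser.parse_args(["--use-faster-whisper", "--faster-whisper-type=int8"])
print(args.use_faster_whisper)   # True   -> the faster-whisper module is used
print(args.faster_whisper_cpu)   # False  -> device stays "cuda"
print(args.faster_whisper_type)  # "int8" -> overrides the "auto" compute type
```

With no flags given, `use_faster_whisper` and `faster_whisper_cpu` default to False and `faster_whisper_type` to None, which the initialization code below treats as "auto".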
@@ -1161,15 +1166,36 @@ if "vosk-stt" in modules:
     app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"])
 
 if "whisper-stt" in modules:
-    print("Initializing Whisper speech-recognition (from ST request file)")
+    whisper_fast = (
+        True
+        if args.use_faster_whisper
+        else False)
+
     whisper_model_path = (
         args.stt_whisper_model_path
         if args.stt_whisper_model_path
         else None)
 
-    import modules.speech_recognition.whisper_module as whisper_module
-    whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
+    if whisper_fast:
+        faster_whisper_device = (
+            "cpu"
+            if args.faster_whisper_cpu
+            else "cuda")
+        faster_whisper_type = (
+            args.faster_whisper_type
+            if args.faster_whisper_type
+            else "auto")
+
+        print(f"Initializing Faster-Whisper speech-recognition (from ST request file) on {faster_whisper_device}")
+        import modules.speech_recognition.faster_whisper_module as whisper_module
+        whisper_module.model = whisper_module.load_model(file_path=whisper_model_path, whisper_device=faster_whisper_device, whisper_compute_type=faster_whisper_type)
+    else:
+        print("Initializing Whisper speech-recognition (from ST request file)")
+        import modules.speech_recognition.whisper_module as whisper_module
+        whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
+
     app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"])
 
 if "streaming-stt" in modules: