mirror of
https://github.com/SillyTavern/SillyTavern-Extras.git
synced 2026-04-23 16:08:57 +00:00
merge speech recognition module with neo branch.
This commit is contained in:
77
modules/speech_recognition/vosk_module.py
Normal file
77
modules/speech_recognition/vosk_module.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
Speech-to-text module based on Vosk for SillyTavern Extras
|
||||
- Vosk website: https://alphacephei.com/vosk/
|
||||
- Vosk api: https://github.com/alphacep/vosk-api
|
||||
|
||||
Authors:
|
||||
- Tony Ribeiro (https://github.com/Tony-sama)
|
||||
|
||||
Models are saved into user cache folder, example: C:/Users/toto/.cache/vosk
|
||||
|
||||
References:
|
||||
- Code adapted from: https://github.com/alphacep/vosk-api/blob/master/python/example/test_simple.py
|
||||
"""
|
||||
from flask import jsonify, abort, request
|
||||
|
||||
import wave
|
||||
from vosk import Model, KaldiRecognizer, SetLogLevel
|
||||
import soundfile
|
||||
|
||||
DEBUG_PREFIX = "<stt vosk module>"
|
||||
RECORDING_FILE_PATH = "stt_test.wav"
|
||||
|
||||
model = None
|
||||
|
||||
SetLogLevel(-1)
|
||||
|
||||
def load_model(file_path=None):
    """Return a Vosk ``Model`` for speech recognition.

    Loads the model stored at *file_path* when one is supplied; otherwise
    falls back to the default English (en-us) model, which Vosk downloads
    into the user cache folder (e.g. C:/Users/toto/.cache/vosk).
    """
    if file_path is not None:
        return Model(file_path)
    return Model(lang="en-us")
|
||||
|
||||
def process_audio():
    """
    Transcribe the request audio file to text using Vosk.

    Expects a multipart form upload under the key "AudioFile" containing a
    mono 16-bit PCM WAV file (the upload is rewritten with soundfile first
    to normalize the WAV header). Returns a JSON response
    ``{"transcript": text}``.

    Aborts with 400 when the upload is missing, and 500 when the file is
    not mono PCM or recognition fails. Returns "" if the module's model
    has not been initialized yet.
    """
    import json  # local import: used to parse the recognizer's JSON output

    if model is None:
        print(DEBUG_PREFIX, "Vosk model not initialized yet.")
        return ""

    # Validate the upload outside the try block so the 400 is not swallowed
    # by the generic exception handler below.
    file = request.files.get('AudioFile')
    if file is None:
        abort(400, DEBUG_PREFIX + " No 'AudioFile' provided in request")

    try:
        file.save(RECORDING_FILE_PATH)

        # Read and rewrite the file with soundfile to normalize the header
        data, samplerate = soundfile.read(RECORDING_FILE_PATH)
        soundfile.write(RECORDING_FILE_PATH, data, samplerate)

        wf = wave.open(RECORDING_FILE_PATH, "rb")
    except Exception as e:  # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX + " Exception occurs while processing audio")

    with wf:  # ensure the wave handle is closed on every path
        # Format check kept outside the try so the specific 500 message
        # reaches the client instead of being re-wrapped generically.
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            abort(500, DEBUG_PREFIX + " Audio file must be WAV format mono PCM.")

        try:
            rec = KaldiRecognizer(model, wf.getframerate())

            # Feed the whole file, collecting every finalized segment so
            # multi-utterance audio is fully transcribed (previously the loop
            # stopped at the first accepted segment). Parse the recognizer's
            # JSON output instead of relying on fragile string slicing.
            segments = []
            while True:
                frames = wf.readframes(4000)
                if len(frames) == 0:
                    break
                if rec.AcceptWaveform(frames):
                    segments.append(json.loads(rec.Result()).get("text", ""))
            segments.append(json.loads(rec.FinalResult()).get("text", ""))
        except Exception as e:  # No exception observed during test but we never know
            print(e)
            abort(500, DEBUG_PREFIX + " Exception occurs while processing audio")

    transcript = " ".join(s for s in segments if s)
    print(DEBUG_PREFIX, "Transcripted from request audio file:", transcript)
    return jsonify({"transcript": transcript})
|
||||
57
modules/speech_recognition/whisper_module.py
Normal file
57
modules/speech_recognition/whisper_module.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""
|
||||
Speech-to-text module based on Whisper for SillyTavern Extras
|
||||
- Whisper github: https://github.com/openai/whisper
|
||||
|
||||
Authors:
|
||||
- Tony Ribeiro (https://github.com/Tony-sama)
|
||||
|
||||
Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
|
||||
|
||||
References:
|
||||
- Code adapted from:
|
||||
- whisper github: https://github.com/openai/whisper
|
||||
- oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
|
||||
"""
|
||||
from random import sample
|
||||
from flask import jsonify, abort, request
|
||||
|
||||
import whisper
|
||||
|
||||
DEBUG_PREFIX = "<stt whisper module>"
|
||||
RECORDING_FILE_PATH = "stt_test.wav"
|
||||
|
||||
model = None
|
||||
|
||||
def load_model(file_path=None):
    """
    Load the given Whisper model from file, or default to the "base.en" model.

    Models are downloaded into the user cache folder, for example:
    C:/Users/toto/.cache/whisper

    (Docstring previously copy-pasted from the Vosk module; it referenced
    vosk and the vosk cache folder.)
    """
    if file_path is None:
        return whisper.load_model("base.en")
    return whisper.load_model(file_path)
|
||||
|
||||
def process_audio():
    """
    Transcribe the request audio file to text using Whisper.

    Expects a multipart form upload under the key "AudioFile". Returns a
    JSON response ``{"transcript": text}``. Aborts with 400 when the upload
    is missing and 500 when transcription fails. Returns "" if the module's
    model has not been initialized yet.
    """
    if model is None:
        print(DEBUG_PREFIX, "Whisper model not initialized yet.")
        return ""

    # request.files.get returns None when the key is absent; previously this
    # fell through to an AttributeError masked as a generic 500.
    file = request.files.get('AudioFile')
    if file is None:
        abort(400, DEBUG_PREFIX + " No 'AudioFile' provided in request")

    try:
        file.save(RECORDING_FILE_PATH)

        result = model.transcribe(RECORDING_FILE_PATH)
        transcript = result["text"]
        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)

        return jsonify({"transcript": transcript})

    except Exception as e:  # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX + " Exception occurs while processing audio")
|
||||
@@ -18,3 +18,6 @@ chromadb
|
||||
sentence_transformers
|
||||
edge-tts
|
||||
TTS
|
||||
vosk
|
||||
sounddevice
|
||||
openai-whisper
|
||||
25
server.py
25
server.py
@@ -84,6 +84,8 @@ parser.add_argument(
|
||||
)
|
||||
parser.add_argument("--coqui-gpu", action="store_false", help="Run the voice models on the GPU (CPU is default)")
|
||||
parser.add_argument("--coqui-model", help="Load a custom Coqui TTS model")
|
||||
# Speech-to-text model overrides (argparse default is None → module default model).
parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-text model")
# Fixed copy-pasted help text: this option selects a *whisper* model, not vosk.
parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model")
|
||||
sd_group = parser.add_mutually_exclusive_group()
|
||||
|
||||
local_sd = sd_group.add_argument_group("sd-local")
|
||||
@@ -309,6 +311,29 @@ CORS(app) # allow cross-domain requests
|
||||
Compress(app) # compress responses
|
||||
app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024
|
||||
|
||||
if "vosk-stt" in modules:
    print("Initializing Vosk STT streaming")
    # argparse leaves the option at None when absent; "or None" keeps the
    # same truthiness-based fallback as the original conditional.
    vosk_model_path = args.stt_vosk_model_path or None

    import modules.speech_recognition.vosk_module as vosk_module

    vosk_module.model = vosk_module.load_model(file_path=vosk_model_path)
    app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"])

if "whisper-stt" in modules:
    print("Initializing Whisper STT streaming")
    whisper_model_path = args.stt_whisper_model_path or None

    import modules.speech_recognition.whisper_module as whisper_module

    whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
    app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"])
|
||||
|
||||
def require_module(name):
|
||||
def wrapper(fn):
|
||||
|
||||
Reference in New Issue
Block a user