mirror of
https://github.com/SillyTavern/SillyTavern-Extras.git
synced 2026-04-26 01:18:59 +00:00
Restored speech recognition streaming mode as an independent module. Performs audio recording using the mic on the server side, detects voice start/end with vosk, and transcribes with whisper.
This commit is contained in:
121
modules/speech_recognition/streaming_module.py
Normal file
121
modules/speech_recognition/streaming_module.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""
|
||||
Speech-to-text module based on Vosk and Whisper for SillyTavern Extras
|
||||
- Vosk website: https://alphacephei.com/vosk/
|
||||
- Vosk api: https://github.com/alphacep/vosk-api
|
||||
- Whisper github: https://github.com/openai/whisper
|
||||
|
||||
Authors:
|
||||
- Tony Ribeiro (https://github.com/Tony-sama)
|
||||
|
||||
Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper and C:/Users/toto/.cache/vosk
|
||||
|
||||
References:
|
||||
- Code adapted from:
|
||||
- whisper github: https://github.com/openai/whisper
|
||||
- oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
|
||||
- vosk github: https://github.com/alphacep/vosk-api/blob/master/python/example/test_microphone.py
|
||||
"""
|
||||
import io
import json
import queue
import sys

import numpy as np
import sounddevice as sd
import soundfile as sf
import vosk
import whisper
from flask import jsonify, abort
from scipy.io.wavfile import write
||||
# Prefix for all console log lines emitted by this module.
DEBUG_PREFIX = "<stt streaming module>"
# Debug dump of the last microphone recording; whisper also re-reads this file.
RECORDING_FILE_PATH = "stt_test.wav"

# Module-level state, populated by the server at startup via load_model()
# (which returns the pair (whisper_model, vosk_model)).
whisper_model = None
vosk_model = None
# sounddevice input device id; None selects the system default input device.
device = None
|
||||
|
||||
def load_model(file_path=None):
    """
    Load the whisper and vosk models used for streaming speech recognition.

    Models are downloaded to the user cache folder on first use, for
    example C:/Users/toto/.cache/whisper and C:/Users/toto/.cache/vosk.

    :param file_path: optional whisper model name/path; defaults to the
        "base.en" whisper model when None.
    :return: tuple (whisper_model, vosk_model). The vosk model is always
        the default "en-us" model regardless of file_path.
    """
    # Both branches of the original code loaded the identical vosk model,
    # so only the whisper model name actually varies with file_path.
    whisper_model_name = file_path if file_path is not None else "base.en"
    return (whisper.load_model(whisper_model_name), vosk.Model(lang="en-us"))
|
||||
|
||||
def convert_bytearray_to_wav_ndarray(input_bytearray: bytes, sampling_rate=16000):
    """
    Convert raw int16 PCM bytes into an audio ndarray by round-tripping
    through an in-memory wav file, so the result can be written to disk
    for a quality check while debugging.

    :param input_bytearray: raw little-endian int16 PCM audio bytes
    :param sampling_rate: sample rate of the audio, defaults to 16 kHz
    :return: audio samples as read back by soundfile
    """
    buffer = io.BytesIO(bytes())
    samples = np.frombuffer(input_bytearray, dtype=np.int16)
    # scipy rewinds file-like objects after writing, so read() starts at 0.
    write(buffer, sampling_rate, samples)
    wav_payload = buffer.read()
    ndarray, _samplerate = sf.read(io.BytesIO(wav_payload))
    return ndarray
|
||||
|
||||
def record_and_transcript():
    """
    Continuously record from the server microphone and transcribe voice.

    Vosk consumes the live stream and is used to detect the end of an
    utterance (AcceptWaveform returning True). The full recording is then
    saved to RECORDING_FILE_PATH and transcribed again with whisper, whose
    transcript is the one returned (better quality than vosk's live one).

    :return: flask json response {"transcript": text}, or "" when the
        whisper model has not been initialized yet.
    :raises: aborts with HTTP 500 on any recording/transcription error.
    """
    if whisper_model is None:
        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
        return ""

    q = queue.Queue()
    stream_errors = list()

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
            stream_errors.append(status)
        q.put(bytes(indata))

    try:
        device_info = sd.query_devices(device, "input")
        # soundfile expects an int, sounddevice provides a float:
        samplerate = int(device_info["default_samplerate"])

        print(DEBUG_PREFIX, "Start recording from:", device_info["name"], "with samplerate", samplerate)

        with sd.RawInputStream(samplerate=samplerate, blocksize=8000, device=device, dtype="int16", channels=1, callback=callback):
            rec = vosk.KaldiRecognizer(vosk_model, samplerate)
            full_recording = bytearray()
            while True:
                data = q.get()
                # Surface any status reported by the audio callback thread.
                if len(stream_errors) > 0:
                    raise Exception(DEBUG_PREFIX+" Stream errors: "+str(stream_errors))

                full_recording.extend(data)

                if rec.AcceptWaveform(data):
                    # Parse vosk's json result properly instead of the
                    # brittle fixed-offset string slicing ([14:-3]) of the
                    # original, which breaks if vosk's output format shifts.
                    transcript = json.loads(rec.Result()).get("text", "")
                    print(DEBUG_PREFIX, "Transcripted from microphone stream (vosk):", transcript)

                    # ----------------------------------
                    # DEBUG: save recording to wav file
                    # ----------------------------------
                    output_file = convert_bytearray_to_wav_ndarray(input_bytearray=full_recording, sampling_rate=samplerate)
                    sf.write(file=RECORDING_FILE_PATH, data=output_file, samplerate=samplerate)
                    print(DEBUG_PREFIX, "Recorded message saved to", RECORDING_FILE_PATH)

                    # Whisper HACK: re-transcribe the saved wav file for a
                    # higher-quality transcript than the vosk live result.
                    result = whisper_model.transcribe(RECORDING_FILE_PATH)
                    transcript = result["text"]
                    print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
                    # ----------------------------------

                    return jsonify({"transcript": transcript})

    except Exception as e: # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX+" Exception occurs while recording")
|
||||
@@ -12,7 +12,6 @@ References:
|
||||
- whisper github: https://github.com/openai/whisper
|
||||
- oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
|
||||
"""
|
||||
from random import sample
|
||||
from flask import jsonify, abort, request
|
||||
|
||||
import whisper
|
||||
|
||||
16
server.py
16
server.py
@@ -305,7 +305,7 @@ Compress(app) # compress responses
|
||||
app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024
|
||||
|
||||
if "vosk-stt" in modules:
|
||||
print("Initializing Vosk STT streaming")
|
||||
print("Initializing Vosk speech-recognition (from ST request file)")
|
||||
vosk_model_path = (
|
||||
args.stt_vosk_model_path
|
||||
if args.stt_vosk_model_path
|
||||
@@ -317,7 +317,7 @@ if "vosk-stt" in modules:
|
||||
app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"])
|
||||
|
||||
if "whisper-stt" in modules:
|
||||
print("Initializing Whisper STT streaming")
|
||||
print("Initializing Whisper speech-recognition (from ST request file)")
|
||||
whisper_model_path = (
|
||||
args.stt_whisper_model_path
|
||||
if args.stt_whisper_model_path
|
||||
@@ -328,6 +328,18 @@ if "whisper-stt" in modules:
|
||||
whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
|
||||
app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"])
|
||||
|
||||
if "streaming-stt" in modules:
|
||||
print("Initializing vosk/whisper speech-recognition (from extras server microphone)")
|
||||
whisper_model_path = (
|
||||
args.stt_whisper_model_path
|
||||
if args.stt_whisper_model_path
|
||||
else None)
|
||||
|
||||
import modules.speech_recognition.streaming_module as streaming_module
|
||||
|
||||
streaming_module.whisper_model, streaming_module.vosk_model = streaming_module.load_model(file_path=whisper_model_path)
|
||||
app.add_url_rule("/api/speech-recognition/streaming/record-and-transcript", view_func=streaming_module.record_and_transcript, methods=["POST"])
|
||||
|
||||
def require_module(name):
|
||||
def wrapper(fn):
|
||||
@wraps(fn)
|
||||
|
||||
Reference in New Issue
Block a user