Restored speech recognition streaming mode as an independent module. Perform audio recording using the mic on the server side, detect voice start/end with Vosk, and transcribe with Whisper.

This commit is contained in:
Tony Ribeiro
2023-07-31 18:50:44 +02:00
parent 83adae950e
commit 912c4335c2
3 changed files with 135 additions and 3 deletions

View File

@@ -0,0 +1,121 @@
"""
Speech-to-text module based on Vosk and Whisper for SillyTavern Extras
- Vosk website: https://alphacephei.com/vosk/
- Vosk api: https://github.com/alphacep/vosk-api
- Whisper github: https://github.com/openai/whisper
Authors:
- Tony Ribeiro (https://github.com/Tony-sama)
Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper and C:/Users/toto/.cache/vosk
References:
- Code adapted from:
- whisper github: https://github.com/openai/whisper
- oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
- vosk github: https://github.com/alphacep/vosk-api/blob/master/python/example/test_microphone.py
"""
import io
import json
import queue
import sys

import numpy as np
import sounddevice as sd
import soundfile as sf
import vosk
import whisper
from flask import jsonify, abort
from scipy.io.wavfile import write
# Prefix put in front of this module's console messages.
DEBUG_PREFIX = "<stt streaming module>"
# Path of the WAV file the recording is saved to; Whisper transcribes from this file.
RECORDING_FILE_PATH = "stt_test.wav"
# Model handles. They stay None until assigned from outside this module;
# record_and_transcript returns an empty string while whisper_model is None.
whisper_model = None
vosk_model = None
# Input device handed to sounddevice when querying and opening the stream.
device = None
def load_model(file_path=None):
    """
    Load the Whisper and Vosk models used by the streaming module.

    file_path is the Whisper model handed to whisper.load_model; when it is
    None the "base.en" model is loaded instead. The Vosk model is always the
    "en-us" one, whatever file_path is.

    Returns a (whisper_model, vosk_model) tuple.
    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
    """
    whisper_source = "base.en" if file_path is None else file_path
    speech_to_text = whisper.load_model(whisper_source)
    voice_detector = vosk.Model(lang="en-us")
    return (speech_to_text, voice_detector)
def convert_bytearray_to_wav_ndarray(input_bytearray: bytes, sampling_rate=16000):
    """
    Round-trip raw 16-bit samples through an in-memory WAV file.

    The buffer is read as int16 samples, encoded as WAV with
    scipy.io.wavfile.write at the given sampling rate, then decoded again
    with soundfile. The decoded samples are returned so they can be written
    to a file for quality-check debugging.
    """
    pcm_samples = np.frombuffer(input_bytearray, dtype=np.int16)
    wav_buffer = io.BytesIO()
    write(wav_buffer, sampling_rate, pcm_samples)
    # getvalue() hands back the whole encoded file whatever the stream position is.
    encoded_wav = wav_buffer.getvalue()
    decoded_samples, _ = sf.read(io.BytesIO(encoded_wav))
    return decoded_samples
def record_and_transcript():
    """
    Record from the microphone until Vosk reports a complete utterance,
    then transcribe the recording with Whisper.

    Audio blocks are pulled from a sounddevice raw input stream and fed to a
    Vosk recognizer, which is only used to detect when an utterance is
    finished. Everything recorded so far is then saved to RECORDING_FILE_PATH
    and that file is transcribed by Whisper.

    Returns:
        A JSON response {"transcript": <whisper text>}, or an empty string
        when the Whisper model has not been initialized.

    Aborts with HTTP 500 if anything fails while recording or transcribing,
    including any status reported by the audio stream callback.
    """
    if whisper_model is None:
        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
        return ""

    audio_blocks = queue.Queue()
    stream_errors = []

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
            stream_errors.append(status)
        audio_blocks.put(bytes(indata))

    try:
        device_info = sd.query_devices(device, "input")
        # soundfile expects an int, sounddevice provides a float:
        samplerate = int(device_info["default_samplerate"])

        print(DEBUG_PREFIX, "Start recording from:", device_info["name"], "with samplerate", samplerate)

        with sd.RawInputStream(samplerate=samplerate, blocksize=8000, device=device, dtype="int16", channels=1, callback=callback):
            rec = vosk.KaldiRecognizer(vosk_model, samplerate)
            full_recording = bytearray()

            while True:
                data = audio_blocks.get()

                # Any status flagged by the callback is treated as fatal.
                if stream_errors:
                    raise RuntimeError(DEBUG_PREFIX+" Stream errors: "+str(stream_errors))

                full_recording.extend(data)

                # Keep recording until Vosk considers the utterance finished.
                if not rec.AcceptWaveform(data):
                    continue

                # Result() is a JSON document; parse it instead of slicing
                # characters so extra fields or layout changes cannot
                # corrupt the extracted text.
                transcript = json.loads(rec.Result()).get("text", "")
                print(DEBUG_PREFIX, "Transcripted from microphone stream (vosk):", transcript)

                # Save the whole recording to a WAV file; this doubles as a
                # debugging aid and as the input handed to Whisper.
                output_file = convert_bytearray_to_wav_ndarray(input_bytearray=full_recording, sampling_rate=samplerate)
                sf.write(file=RECORDING_FILE_PATH, data=output_file, samplerate=samplerate)
                print(DEBUG_PREFIX, "Recorded message saved to", RECORDING_FILE_PATH)

                # Whisper HACK: the returned transcript comes from Whisper
                # reading the saved file, not from the Vosk result above.
                result = whisper_model.transcribe(RECORDING_FILE_PATH)
                transcript = result["text"]
                print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)

                return jsonify({"transcript": transcript})

    except Exception as e:  # No exception observed during test but we never know
        print(e)
        abort(500, DEBUG_PREFIX+" Exception occurs while recording")

View File

@@ -12,7 +12,6 @@ References:
- whisper github: https://github.com/openai/whisper
- oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
"""
from random import sample
from flask import jsonify, abort, request
import whisper

View File

@@ -305,7 +305,7 @@ Compress(app) # compress responses
app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024
if "vosk-stt" in modules:
print("Initializing Vosk STT streaming")
print("Initializing Vosk speech-recognition (from ST request file)")
vosk_model_path = (
args.stt_vosk_model_path
if args.stt_vosk_model_path
@@ -317,7 +317,7 @@ if "vosk-stt" in modules:
app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"])
if "whisper-stt" in modules:
print("Initializing Whisper STT streaming")
print("Initializing Whisper speech-recognition (from ST request file)")
whisper_model_path = (
args.stt_whisper_model_path
if args.stt_whisper_model_path
@@ -328,6 +328,18 @@ if "whisper-stt" in modules:
whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"])
# Optional "streaming-stt" module: speech recognition from the microphone of the machine running this server.
if "streaming-stt" in modules:
print("Initializing vosk/whisper speech-recognition (from extras server microphone)")
# Whisper model path taken from args when set; None otherwise, which is passed on to load_model as-is.
whisper_model_path = (
args.stt_whisper_model_path
if args.stt_whisper_model_path
else None)
import modules.speech_recognition.streaming_module as streaming_module
# load_model returns both models; store them on the module so its view function can use them.
streaming_module.whisper_model, streaming_module.vosk_model = streaming_module.load_model(file_path=whisper_model_path)
# Expose the recording/transcription view as a POST endpoint.
app.add_url_rule("/api/speech-recognition/streaming/record-and-transcript", view_func=streaming_module.record_and_transcript, methods=["POST"])
def require_module(name):
def wrapper(fn):
@wraps(fn)