From e32d0c30c2162071bf6a344f42d5b3f1cb9ac23d Mon Sep 17 00:00:00 2001 From: Cohee <18619528+Cohee1207@users.noreply.github.com> Date: Sun, 19 Nov 2023 21:10:41 +0200 Subject: [PATCH] Add language parameter for whisper --- modules/speech_recognition/streaming_module.py | 5 +++-- modules/speech_recognition/vosk_module.py | 6 +++--- modules/speech_recognition/whisper_module.py | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/modules/speech_recognition/streaming_module.py b/modules/speech_recognition/streaming_module.py index 981e503..1ea433a 100644 --- a/modules/speech_recognition/streaming_module.py +++ b/modules/speech_recognition/streaming_module.py @@ -15,7 +15,7 @@ References: - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui - vosk github: https://github.com/alphacep/vosk-api/blob/master/python/example/test_microphone.py """ -from flask import jsonify, abort +from flask import jsonify, abort, request import queue import sys @@ -77,6 +77,7 @@ def record_and_transcript(): q.put(bytes(indata)) try: + language = request.form.get('language', default=None) device_info = sd.query_devices(device, "input") # soundfile expects an int, sounddevice provides a float: samplerate = int(device_info["default_samplerate"]) @@ -107,7 +108,7 @@ def record_and_transcript(): print(DEBUG_PREFIX, "Recorded message saved to", RECORDING_FILE_PATH) # Whisper HACK - result = whisper_model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False) + result = whisper_model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language) transcript = result["text"] print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) # ---------------------------------- diff --git a/modules/speech_recognition/vosk_module.py b/modules/speech_recognition/vosk_module.py index 708056b..013d56e 100644 --- a/modules/speech_recognition/vosk_module.py +++ b/modules/speech_recognition/vosk_module.py @@ -44,7 +44,7 @@ def process_audio(): print(DEBUG_PREFIX,"Vosk model not initialized yet.") return "" - try: + try: file = request.files.get('AudioFile') file.save(RECORDING_FILE_PATH) @@ -67,11 +67,11 @@ def process_audio(): break if rec.AcceptWaveform(data): break - + transcript = rec.Result()[14:-3] print(DEBUG_PREFIX, "Transcripted from request audio file:", transcript) return jsonify({"transcript": transcript}) except Exception as e: # No exception observed during test but we never know print(e) - abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") \ No newline at end of file + abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") diff --git a/modules/speech_recognition/whisper_module.py b/modules/speech_recognition/whisper_module.py index 6289bf0..056b849 100644 --- a/modules/speech_recognition/whisper_module.py +++ b/modules/speech_recognition/whisper_module.py @@ -43,9 +43,10 @@ def process_audio(): try: file = request.files.get('AudioFile') + language = request.form.get('language', default=None) file.save(RECORDING_FILE_PATH) - result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False) + result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language) transcript = result["text"] print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)