From 9d55444995a93aba06fae10bcff6e65d3f3e6e88 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:09:56 +0800 Subject: [PATCH 01/15] Add files via upload --- fasterWhisperRequirements.txt | 6 ++++ whisper_module.py | 56 +++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 fasterWhisperRequirements.txt create mode 100644 whisper_module.py diff --git a/fasterWhisperRequirements.txt b/fasterWhisperRequirements.txt new file mode 100644 index 0000000..a2cb6c0 --- /dev/null +++ b/fasterWhisperRequirements.txt @@ -0,0 +1,6 @@ +ctranslate2==4.4.0 +huggingface_hub>=0.13 +tokenizers>=0.13,<1 +onnxruntime>=1.14,<2 +av>=11 +tqdm \ No newline at end of file diff --git a/whisper_module.py b/whisper_module.py new file mode 100644 index 0000000..551ac89 --- /dev/null +++ b/whisper_module.py @@ -0,0 +1,56 @@ +""" +Speech-to-text module based on Whisper for SillyTavern Extras + - Whisper github: https://github.com/openai/whisper + +Authors: + - Tony Ribeiro (https://github.com/Tony-sama) + +Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper + +References: + - Code adapted from: + - whisper github: https://github.com/openai/whisper + - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui +""" +from flask import jsonify, abort, request + +from faster_whisper import WhisperModel + +DEBUG_PREFIX = "" +RECORDING_FILE_PATH = "stt_test.wav" + +model_size = "large-v3-turbo" + +model = WhisperModel(model_size, device="cuda", compute_type="float16") + +def load_model(file_path=None): + """ + Load given vosk model from file or default to en-us model. + Download model to user cache folder, example: C:/Users/toto/.cache/vosk + """ + return WhisperModel(model_size, device="cuda", compute_type="float16") + +def process_audio(): + """ + Transcript request audio file to text using Whisper + """ + + if model is None: + print(DEBUG_PREFIX,"Whisper model not initialized yet.") + return WhisperModel(model_size, device="cuda", compute_type="float16") + + try: + file = request.files.get('AudioFile') + language = request.form.get('language', default=None) + file.save(RECORDING_FILE_PATH) + segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5) + transcript="" + for segment in segments: + transcript=transcript+" "+segment.text + print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) + + return jsonify({"transcript": transcript}) + + except Exception as e: # No exception observed during test but we never know + print(e) + abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") From 590320728152f5c0228a1667dd1e8c290148f8de Mon Sep 17 00:00:00 2001 From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:11:09 +0800 Subject: [PATCH 02/15] Delete whisper_module.py --- whisper_module.py | 56 ----------------------------------------------- 1 file changed, 56 deletions(-) delete mode 100644 whisper_module.py diff --git a/whisper_module.py b/whisper_module.py deleted file mode 100644 index 551ac89..0000000 --- a/whisper_module.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Speech-to-text module based on Whisper for SillyTavern Extras - - Whisper github: https://github.com/openai/whisper - -Authors: - - Tony Ribeiro (https://github.com/Tony-sama) - -Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper - -References: - - Code adapted from: - - whisper github: https://github.com/openai/whisper - - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui -""" -from flask import jsonify, abort, request - -from faster_whisper import WhisperModel - -DEBUG_PREFIX = "" -RECORDING_FILE_PATH = "stt_test.wav" - -model_size = "large-v3-turbo" - -model = WhisperModel(model_size, device="cuda", compute_type="float16") - -def load_model(file_path=None): - """ - Load given vosk model from file or default to en-us model. - Download model to user cache folder, example: C:/Users/toto/.cache/vosk - """ - return WhisperModel(model_size, device="cuda", compute_type="float16") - -def process_audio(): - """ - Transcript request audio file to text using Whisper - """ - - if model is None: - print(DEBUG_PREFIX,"Whisper model not initialized yet.") - return WhisperModel(model_size, device="cuda", compute_type="float16") - - try: - file = request.files.get('AudioFile') - language = request.form.get('language', default=None) - file.save(RECORDING_FILE_PATH) - segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5) - transcript="" - for segment in segments: - transcript=transcript+" "+segment.text - print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) - - return jsonify({"transcript": transcript}) - - except Exception as e: # No exception observed during test but we never know - print(e) - abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") From 24d025af0b2ed528e8e418b91dc9bfaf19617c46 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:11:29 +0800 Subject: [PATCH 03/15] Add files via upload --- modules/whisper_module.py | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 modules/whisper_module.py diff --git a/modules/whisper_module.py b/modules/whisper_module.py new file mode 100644 index 0000000..551ac89 --- /dev/null +++ b/modules/whisper_module.py @@ -0,0 +1,56 @@ +""" +Speech-to-text module based on Whisper for SillyTavern Extras + - Whisper github: https://github.com/openai/whisper + +Authors: + - Tony Ribeiro (https://github.com/Tony-sama) + +Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper + +References: + - Code adapted from: + - whisper github: https://github.com/openai/whisper + - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui +""" +from flask import jsonify, abort, request + +from faster_whisper import WhisperModel + +DEBUG_PREFIX = "" +RECORDING_FILE_PATH = "stt_test.wav" + +model_size = "large-v3-turbo" + +model = WhisperModel(model_size, device="cuda", compute_type="float16") + +def load_model(file_path=None): + """ + Load given vosk model from file or default to en-us model. + Download model to user cache folder, example: C:/Users/toto/.cache/vosk + """ + return WhisperModel(model_size, device="cuda", compute_type="float16") + +def process_audio(): + """ + Transcript request audio file to text using Whisper + """ + + if model is None: + print(DEBUG_PREFIX,"Whisper model not initialized yet.") + return WhisperModel(model_size, device="cuda", compute_type="float16") + + try: + file = request.files.get('AudioFile') + language = request.form.get('language', default=None) + file.save(RECORDING_FILE_PATH) + segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5) + transcript="" + for segment in segments: + transcript=transcript+" "+segment.text + print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) + + return jsonify({"transcript": transcript}) + + except Exception as e: # No exception observed during test but we never know + print(e) + abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") From ea996b19571705fbd3db42852980e8b57a834eea Mon Sep 17 00:00:00 2001 From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:11:52 +0800 Subject: [PATCH 04/15] Delete modules/whisper_module.py --- modules/whisper_module.py | 56 --------------------------------------- 1 file changed, 56 deletions(-) delete mode 100644 modules/whisper_module.py diff --git a/modules/whisper_module.py b/modules/whisper_module.py deleted file mode 100644 index 551ac89..0000000 --- a/modules/whisper_module.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Speech-to-text module based on Whisper for SillyTavern Extras - - Whisper github: https://github.com/openai/whisper - -Authors: - - Tony Ribeiro (https://github.com/Tony-sama) - -Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper - -References: - - Code adapted from: - - whisper github: https://github.com/openai/whisper - - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui -""" -from flask import jsonify, abort, request - -from faster_whisper import WhisperModel - -DEBUG_PREFIX = "" -RECORDING_FILE_PATH = "stt_test.wav" - -model_size = "large-v3-turbo" - -model = WhisperModel(model_size, device="cuda", compute_type="float16") - -def load_model(file_path=None): - """ - Load given vosk model from file or default to en-us model. - Download model to user cache folder, example: C:/Users/toto/.cache/vosk - """ - return WhisperModel(model_size, device="cuda", compute_type="float16") - -def process_audio(): - """ - Transcript request audio file to text using Whisper - """ - - if model is None: - print(DEBUG_PREFIX,"Whisper model not initialized yet.") - return WhisperModel(model_size, device="cuda", compute_type="float16") - - try: - file = request.files.get('AudioFile') - language = request.form.get('language', default=None) - file.save(RECORDING_FILE_PATH) - segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5) - transcript="" - for segment in segments: - transcript=transcript+" "+segment.text - print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) - - return jsonify({"transcript": transcript}) - - except Exception as e: # No exception observed during test but we never know - print(e) - abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") From 2fbbe98eda5dc4a92a8a5a4dbdb8a5578e106eeb Mon Sep 17 00:00:00 2001 From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:12:04 +0800 Subject: [PATCH 05/15] Add files via upload --- modules/speech_recognition/whisper_module.py | 113 +++++++++---------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/modules/speech_recognition/whisper_module.py b/modules/speech_recognition/whisper_module.py index 056b849..551ac89 100644 --- a/modules/speech_recognition/whisper_module.py +++ b/modules/speech_recognition/whisper_module.py @@ -1,57 +1,56 @@ -""" -Speech-to-text module based on Whisper for SillyTavern Extras - - Whisper github: https://github.com/openai/whisper - -Authors: - - Tony Ribeiro (https://github.com/Tony-sama) - -Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper - -References: - - Code adapted from: - - whisper github: https://github.com/openai/whisper - - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui -""" -from flask import jsonify, abort, request - -import whisper - -DEBUG_PREFIX = "" -RECORDING_FILE_PATH = "stt_test.wav" - -model = None - -def load_model(file_path=None): - """ - Load given vosk model from file or default to en-us model. - Download model to user cache folder, example: C:/Users/toto/.cache/vosk - """ - - if file_path is None: - return whisper.load_model("base.en") - else: - return whisper.load_model(file_path) - -def process_audio(): - """ - Transcript request audio file to text using Whisper - """ - - if model is None: - print(DEBUG_PREFIX,"Whisper model not initialized yet.") - return "" - - try: - file = request.files.get('AudioFile') - language = request.form.get('language', default=None) - file.save(RECORDING_FILE_PATH) - - result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language) - transcript = result["text"] - print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) - - return jsonify({"transcript": transcript}) - - except Exception as e: # No exception observed during test but we never know - print(e) - abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") +""" +Speech-to-text module based on Whisper for SillyTavern Extras + - Whisper github: https://github.com/openai/whisper + +Authors: + - Tony Ribeiro (https://github.com/Tony-sama) + +Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper + +References: + - Code adapted from: + - whisper github: https://github.com/openai/whisper + - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui +""" +from flask import jsonify, abort, request + +from faster_whisper import WhisperModel + +DEBUG_PREFIX = "" +RECORDING_FILE_PATH = "stt_test.wav" + +model_size = "large-v3-turbo" + +model = WhisperModel(model_size, device="cuda", compute_type="float16") + +def load_model(file_path=None): + """ + Load given vosk model from file or default to en-us model. + Download model to user cache folder, example: C:/Users/toto/.cache/vosk + """ + return WhisperModel(model_size, device="cuda", compute_type="float16") + +def process_audio(): + """ + Transcript request audio file to text using Whisper + """ + + if model is None: + print(DEBUG_PREFIX,"Whisper model not initialized yet.") + return WhisperModel(model_size, device="cuda", compute_type="float16") + + try: + file = request.files.get('AudioFile') + language = request.form.get('language', default=None) + file.save(RECORDING_FILE_PATH) + segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5) + transcript="" + for segment in segments: + transcript=transcript+" "+segment.text + print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) + + return jsonify({"transcript": transcript}) + + except Exception as e: # No exception observed during test but we never know + print(e) + abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") From 6b3c1a3dd24d439bbd8d581520773f655aa571a3 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:16:35 +0800 Subject: [PATCH 06/15] Update fasterWhisperRequirements.txt --- fasterWhisperRequirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fasterWhisperRequirements.txt b/fasterWhisperRequirements.txt index a2cb6c0..d0aa851 100644 --- a/fasterWhisperRequirements.txt +++ b/fasterWhisperRequirements.txt @@ -3,4 +3,5 @@ huggingface_hub>=0.13 tokenizers>=0.13,<1 onnxruntime>=1.14,<2 av>=11 -tqdm \ No newline at end of file +tqdm +faster-whisper From 23e8a2cf43da6b099deb2a4d5c85a9a8303febf7 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 Date: Mon, 9 Dec 2024 20:30:01 +0800 Subject: [PATCH 07/15] Revert "Add files via upload" This reverts commit 2fbbe98eda5dc4a92a8a5a4dbdb8a5578e106eeb. --- modules/speech_recognition/whisper_module.py | 113 ++++++++++--------- 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/modules/speech_recognition/whisper_module.py b/modules/speech_recognition/whisper_module.py index 551ac89..056b849 100644 --- a/modules/speech_recognition/whisper_module.py +++ b/modules/speech_recognition/whisper_module.py @@ -1,56 +1,57 @@ -""" -Speech-to-text module based on Whisper for SillyTavern Extras - - Whisper github: https://github.com/openai/whisper - -Authors: - - Tony Ribeiro (https://github.com/Tony-sama) - -Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper - -References: - - Code adapted from: - - whisper github: https://github.com/openai/whisper - - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui -""" -from flask import jsonify, abort, request - -from faster_whisper import WhisperModel - -DEBUG_PREFIX = "" -RECORDING_FILE_PATH = "stt_test.wav" - -model_size = "large-v3-turbo" - -model = WhisperModel(model_size, device="cuda", compute_type="float16") - -def load_model(file_path=None): - """ - Load given vosk model from file or default to en-us model. - Download model to user cache folder, example: C:/Users/toto/.cache/vosk - """ - return WhisperModel(model_size, device="cuda", compute_type="float16") - -def process_audio(): - """ - Transcript request audio file to text using Whisper - """ - - if model is None: - print(DEBUG_PREFIX,"Whisper model not initialized yet.") - return WhisperModel(model_size, device="cuda", compute_type="float16") - - try: - file = request.files.get('AudioFile') - language = request.form.get('language', default=None) - file.save(RECORDING_FILE_PATH) - segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5) - transcript="" - for segment in segments: - transcript=transcript+" "+segment.text - print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) - - return jsonify({"transcript": transcript}) - - except Exception as e: # No exception observed during test but we never know - print(e) - abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") +""" +Speech-to-text module based on Whisper for SillyTavern Extras + - Whisper github: https://github.com/openai/whisper + +Authors: + - Tony Ribeiro (https://github.com/Tony-sama) + +Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper + +References: + - Code adapted from: + - whisper github: https://github.com/openai/whisper + - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui +""" +from flask import jsonify, abort, request + +import whisper + +DEBUG_PREFIX = "" +RECORDING_FILE_PATH = "stt_test.wav" + +model = None + +def load_model(file_path=None): + """ + Load given vosk model from file or default to en-us model. + Download model to user cache folder, example: C:/Users/toto/.cache/vosk + """ + + if file_path is None: + return whisper.load_model("base.en") + else: + return whisper.load_model(file_path) + +def process_audio(): + """ + Transcript request audio file to text using Whisper + """ + + if model is None: + print(DEBUG_PREFIX,"Whisper model not initialized yet.") + return "" + + try: + file = request.files.get('AudioFile') + language = request.form.get('language', default=None) + file.save(RECORDING_FILE_PATH) + + result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language) + transcript = result["text"] + print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) + + return jsonify({"transcript": transcript}) + + except Exception as e: # No exception observed during test but we never know + print(e) + abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") From f3cb6c330b6c1bcabd55fca8b56f9f2dbdf97e56 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 Date: Mon, 9 Dec 2024 20:37:48 +0800 Subject: [PATCH 08/15] Update whisper_module.py --- modules/speech_recognition/whisper_module.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/modules/speech_recognition/whisper_module.py b/modules/speech_recognition/whisper_module.py index 056b849..6b699f9 100644 --- a/modules/speech_recognition/whisper_module.py +++ b/modules/speech_recognition/whisper_module.py @@ -13,24 +13,24 @@ References: - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui """ from flask import jsonify, abort, request - -import whisper +from faster_whisper import WhisperModel DEBUG_PREFIX = "" RECORDING_FILE_PATH = "stt_test.wav" -model = None +model_size = "large-v3-turbo" + +model = WhisperModel(model_size, device="cuda", compute_type="float16") def load_model(file_path=None): """ Load given vosk model from file or default to en-us model. Download model to user cache folder, example: C:/Users/toto/.cache/vosk """ - if file_path is None: - return whisper.load_model("base.en") + return WhisperModel(model_size, device="cuda", compute_type="float16") else: - return whisper.load_model(file_path) + return WhisperModel(file_path, device="cuda", compute_type="float16") def process_audio(): """ @@ -45,9 +45,10 @@ def process_audio(): file = request.files.get('AudioFile') language = request.form.get('language', default=None) file.save(RECORDING_FILE_PATH) - - result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language) - transcript = result["text"] + segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5) + transcript="" + for segment in segments: + transcript=transcript+" "+segment.text print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) return jsonify({"transcript": transcript}) From 2ef37240b90d0abecc81f4d2b575843f0140626a Mon Sep 17 00:00:00 2001 From: ArrangingFear56 Date: Tue, 10 Dec 2024 00:08:53 +0800 Subject: [PATCH 09/15] Revert "Update whisper_module.py" This reverts commit f3cb6c330b6c1bcabd55fca8b56f9f2dbdf97e56. --- modules/speech_recognition/whisper_module.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/modules/speech_recognition/whisper_module.py b/modules/speech_recognition/whisper_module.py index 6b699f9..056b849 100644 --- a/modules/speech_recognition/whisper_module.py +++ b/modules/speech_recognition/whisper_module.py @@ -13,24 +13,24 @@ References: - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui """ from flask import jsonify, abort, request -from faster_whisper import WhisperModel + +import whisper DEBUG_PREFIX = "" RECORDING_FILE_PATH = "stt_test.wav" -model_size = "large-v3-turbo" - -model = WhisperModel(model_size, device="cuda", compute_type="float16") +model = None def load_model(file_path=None): """ Load given vosk model from file or default to en-us model. Download model to user cache folder, example: C:/Users/toto/.cache/vosk """ + if file_path is None: - return WhisperModel(model_size, device="cuda", compute_type="float16") + return whisper.load_model("base.en") else: - return WhisperModel(file_path, device="cuda", compute_type="float16") + return whisper.load_model(file_path) def process_audio(): """ @@ -45,10 +45,9 @@ def process_audio(): file = request.files.get('AudioFile') language = request.form.get('language', default=None) file.save(RECORDING_FILE_PATH) - segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5) - transcript="" - for segment in segments: - transcript=transcript+" "+segment.text + + result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language) + transcript = result["text"] print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) return jsonify({"transcript": transcript}) From dbec27b18a5613a9e1c46bf00a9e6efc9b12f7b7 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 Date: Tue, 10 Dec 2024 00:55:46 +0800 Subject: [PATCH 10/15] added arguments for faster-whisper usage without breaking original whisper implementation --- .../faster_whisper_module.py | 60 +++++++++++++++++++ server.py | 34 +++++++++-- 2 files changed, 90 insertions(+), 4 deletions(-) create mode 100644 modules/speech_recognition/faster_whisper_module.py diff --git a/modules/speech_recognition/faster_whisper_module.py b/modules/speech_recognition/faster_whisper_module.py new file mode 100644 index 0000000..50f4c0b --- /dev/null +++ b/modules/speech_recognition/faster_whisper_module.py @@ -0,0 +1,60 @@ +""" +Speech-to-text module based on Whisper for SillyTavern Extras + - Whisper github: https://github.com/openai/whisper + +Authors: + - Tony Ribeiro (https://github.com/Tony-sama) + +Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper + +References: + - Code adapted from: + - whisper github: https://github.com/openai/whisper + - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui +""" +from flask import jsonify, abort, request + +from faster_whisper import WhisperModel + +DEBUG_PREFIX = "" +RECORDING_FILE_PATH = "stt_test.wav" + +model_size = "large-v3-turbo" + +def load_model(file_path=None,whisper_device="cuda",whisper_compute_type='float16'): + """ + Load given vosk model from file or default to en-us model. + Download model to user cache folder, example: C:/Users/toto/.cache/vosk + """ + + if file_path is None: + print(f"faster-whisper using {model_size}") + return WhisperModel(model_size, device=whisper_device, compute_type=whisper_compute_type) + else: + print(f"faster-whisper using {file_path}") + return WhisperModel(file_path, device=whisper_device, compute_type=whisper_compute_type) + +def process_audio(): + """ + Transcript request audio file to text using Whisper + """ + + if model is None: + print(DEBUG_PREFIX,"Whisper model not initialized yet.") + return "" + + try: + file = request.files.get('AudioFile') + language = request.form.get('language', default=None) + file.save(RECORDING_FILE_PATH) + segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5) + transcript="" + for segment in segments: + transcript=transcript+" "+segment.text + print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript) + + return jsonify({"transcript": transcript}) + + except Exception as e: # No exception observed during test but we never know + print(e) + abort(500, DEBUG_PREFIX+" Exception occurs while processing audio") diff --git a/server.py b/server.py index 48b7d36..6a3548e 100644 --- a/server.py +++ b/server.py @@ -935,7 +935,12 @@ parser.add_argument("--max-content-length", help="Set the max") parser.add_argument("--rvc-save-file", action="store_true", help="Save the last rvc input/output audio file into data/tmp/ folder (for research)") parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-text model") -parser.add_argument("--stt-whisper-model-path", help="Load a custom vosk speech-to-text model") +parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model") + +parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper") +parser.add_argument("--faster-whisper-device", help="Choose between cpu and cuda to run faster-whisper, defaults to cuda") +parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16") + # sd_group = parser.add_mutually_exclusive_group() local_sd = parser.add_argument_group("sd-local") @@ -1161,15 +1166,36 @@ if "vosk-stt" in modules: app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"]) if "whisper-stt" in modules: - print("Initializing Whisper speech-recognition (from ST request file)") + whisper_fast=( + True + if args.use_faster_whisper + else False) + whisper_model_path = ( args.stt_whisper_model_path if args.stt_whisper_model_path else None) - import modules.speech_recognition.whisper_module as whisper_module + if whisper_fast: + + faster_whisper_device=( + args.faster_whisper_device + if args.faster_whisper_device + else "cuda") + + faster_whisper_type=( + args.faster_whisper_type + if args.faster_whisper_type + else "float16") + + print(f"Initializing Faster-Whisper speech-recognition (from ST request file) on {faster_whisper_device}") + import modules.speech_recognition.faster_whisper_module as whisper_module + whisper_module.model = whisper_module.load_model(file_path=whisper_model_path,whisper_device=faster_whisper_device,whisper_compute_type=faster_whisper_type) + else: + print("Initializing Whisper speech-recognition (from ST request file)") + import modules.speech_recognition.whisper_module as whisper_module + whisper_module.model = whisper_module.load_model(file_path=whisper_model_path) - whisper_module.model = whisper_module.load_model(file_path=whisper_model_path) app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"]) if "streaming-stt" in modules: From d86a8622c33ff67330d029b92c0353c82a9e1229 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 Date: Tue, 10 Dec 2024 01:03:49 +0800 Subject: [PATCH 11/15] modified server.py for ease of choosing faster-whisper-device --- .gitignore | 1 + modules/speech_recognition/faster_whisper_module.py | 1 + server.py | 6 +++--- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index e385523..61324d0 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,4 @@ api_key.txt stt_test.wav talkinghead/tha3/models docker/cache +launch.bat diff --git a/modules/speech_recognition/faster_whisper_module.py b/modules/speech_recognition/faster_whisper_module.py index 50f4c0b..aedb95a 100644 --- a/modules/speech_recognition/faster_whisper_module.py +++ b/modules/speech_recognition/faster_whisper_module.py @@ -21,6 +21,7 @@ RECORDING_FILE_PATH = "stt_test.wav" model_size = "large-v3-turbo" + def load_model(file_path=None,whisper_device="cuda",whisper_compute_type='float16'): """ Load given vosk model from file or default to en-us model. diff --git a/server.py b/server.py index 6a3548e..38054e0 100644 --- a/server.py +++ b/server.py @@ -938,7 +938,7 @@ parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to- parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model") parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper") -parser.add_argument("--faster-whisper-device", help="Choose between cpu and cuda to run faster-whisper, defaults to cuda") +parser.add_argument("--faster-whisper-cpu", action="store_true", help="Use cpu to run faster-whisper, saves VRAM but much slower") parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16") # sd_group = parser.add_mutually_exclusive_group() @@ -1179,8 +1179,8 @@ if "whisper-stt" in modules: if whisper_fast: faster_whisper_device=( - args.faster_whisper_device - if args.faster_whisper_device + "cpu" + if args.faster_whisper_cpu else "cuda") faster_whisper_type=( From 6435ea40f180af9e731d0cc5b6608ee45a026eb0 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 Date: Tue, 10 Dec 2024 01:23:13 +0800 Subject: [PATCH 12/15] change default computing types for different devices --- modules/speech_recognition/faster_whisper_module.py | 11 ++++++++--- server.py | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/modules/speech_recognition/faster_whisper_module.py b/modules/speech_recognition/faster_whisper_module.py index aedb95a..c571257 100644 --- a/modules/speech_recognition/faster_whisper_module.py +++ b/modules/speech_recognition/faster_whisper_module.py @@ -22,17 +22,22 @@ RECORDING_FILE_PATH = "stt_test.wav" model_size = "large-v3-turbo" -def load_model(file_path=None,whisper_device="cuda",whisper_compute_type='float16'): +def load_model(file_path=None,whisper_device="cuda",whisper_compute_type="auto"): """ Load given vosk model from file or default to en-us model. Download model to user cache folder, example: C:/Users/toto/.cache/vosk """ + if whisper_compute_type=="auto": + whisper_compute_type=( + "int8" + if whisper_device=="cpu" + else "float16") if file_path is None: - print(f"faster-whisper using {model_size}") + print(f"faster-whisper using {model_size} model with {whisper_compute_type}") return WhisperModel(model_size, device=whisper_device, compute_type=whisper_compute_type) else: - print(f"faster-whisper using {file_path}") + print(f"faster-whisper using {file_path} model with {whisper_compute_type}") return WhisperModel(file_path, device=whisper_device, compute_type=whisper_compute_type) def process_audio(): diff --git a/server.py b/server.py index 38054e0..e821bd6 100644 --- a/server.py +++ b/server.py @@ -939,7 +939,7 @@ parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper spee parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper") parser.add_argument("--faster-whisper-cpu", action="store_true", help="Use cpu to run faster-whisper, saves VRAM but much slower") -parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16") +parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16 for cuda and int8 for cpu") # sd_group = parser.add_mutually_exclusive_group() @@ -1186,7 +1186,7 @@ if "whisper-stt" in modules: faster_whisper_type=( args.faster_whisper_type if args.faster_whisper_type - else "float16") + else "auto") print(f"Initializing Faster-Whisper speech-recognition (from ST request file) on {faster_whisper_device}") import modules.speech_recognition.faster_whisper_module as whisper_module From 97251b1a0cc89a99f3d0c458d14d69dfe3a1e19f Mon Sep 17 00:00:00 2001 From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com> Date: Tue, 10 Dec 2024 01:23:34 +0800 Subject: [PATCH 13/15] Update README.md --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 3767cea..882963e 100644 --- a/README.md +++ b/README.md @@ -230,6 +230,19 @@ cd SillyTavern-extras | `--sd-remote-ssl` | Use SSL for the remote SD backend
Default: **False** | | `--sd-remote-auth` | Specify the `username:password` for the remote SD backend (if required) | +## For faster-whisper instead of whisper when using whisper(extras) +1. Install CUDA 12 and cuDNN 8 +2. Install faster-whisper requirements +``` +pip install -r fasterWhisperRequirements.txt +``` +use the `--use-faster-whisper` argument to switch to faster-whisper + +Optional: + +1. use the `--faster-whisper-type` argument to change compute mode for faster-whisper(ex. `--faster-whisper-type=int8`) +2. use the `--faster-whisper-cpu` argument to switch computing device to cpu + ## Coqui TTS ### Running on Mac M1 From fe91e0742f6204ea520cc23dae2a591ce0156e35 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 Date: Wed, 11 Dec 2024 00:19:18 +0800 Subject: [PATCH 14/15] changed name of requirements of faster whisper to fit in line with the rest of the requirements --- ...irements.txt => requirements-faster-whisper.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) rename fasterWhisperRequirements.txt => requirements-faster-whisper.txt (94%) diff --git a/fasterWhisperRequirements.txt b/requirements-faster-whisper.txt similarity index 94% rename from fasterWhisperRequirements.txt rename to requirements-faster-whisper.txt index d0aa851..76a9c65 100644 --- a/fasterWhisperRequirements.txt +++ b/requirements-faster-whisper.txt @@ -1,7 +1,7 @@ -ctranslate2==4.4.0 -huggingface_hub>=0.13 -tokenizers>=0.13,<1 -onnxruntime>=1.14,<2 -av>=11 -tqdm -faster-whisper +ctranslate2==4.4.0 +huggingface_hub>=0.13 +tokenizers>=0.13,<1 +onnxruntime>=1.14,<2 +av>=11 +tqdm +faster-whisper From 028b798073f45601712c242adf79ce89a4da8952 Mon Sep 17 00:00:00 2001 From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com> Date: Wed, 11 Dec 2024 00:21:29 +0800 Subject: [PATCH 15/15] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 882963e..c1dd9b4 100644 --- a/README.md +++ b/README.md @@ -234,7 +234,7 @@ cd SillyTavern-extras 1. Install CUDA 12 and cuDNN 8 2. Install faster-whisper requirements ``` -pip install -r fasterWhisperRequirements.txt +pip install -r requirements-faster-whisper.txt ``` use the `--use-faster-whisper` argument to switch to faster-whisper