From 9d55444995a93aba06fae10bcff6e65d3f3e6e88 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:09:56 +0800
Subject: [PATCH 01/15] Add files via upload

---
 fasterWhisperRequirements.txt |  6 ++++
 whisper_module.py             | 56 +++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 fasterWhisperRequirements.txt
 create mode 100644 whisper_module.py
diff --git a/fasterWhisperRequirements.txt b/fasterWhisperRequirements.txt
new file mode 100644
index 0000000..a2cb6c0
--- /dev/null
+++ b/fasterWhisperRequirements.txt
@@ -0,0 +1,6 @@
+ctranslate2==4.4.0
+huggingface_hub>=0.13
+tokenizers>=0.13,<1
+onnxruntime>=1.14,<2 
+av>=11
+tqdm
\ No newline at end of file
diff --git a/whisper_module.py b/whisper_module.py
new file mode 100644
index 0000000..551ac89
--- /dev/null
+++ b/whisper_module.py
@@ -0,0 +1,56 @@
+"""
+Speech-to-text module based on Whisper for SillyTavern Extras
+    - Whisper github: https://github.com/openai/whisper
+
+Authors:
+    - Tony Ribeiro (https://github.com/Tony-sama)
+
+Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
+
+References:
+    - Code adapted from:
+        - whisper github: https://github.com/openai/whisper
+        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
+"""
+from flask import jsonify, abort, request
+
+from faster_whisper import WhisperModel
+
+DEBUG_PREFIX = "<stt whisper module>"
+RECORDING_FILE_PATH = "stt_test.wav"
+
+model_size = "large-v3-turbo"
+
+model = WhisperModel(model_size, device="cuda", compute_type="float16")
+
+def load_model(file_path=None):
+    """
+    Load given vosk model from file or default to en-us model.
+    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
+    """
+    return WhisperModel(model_size, device="cuda", compute_type="float16")
+
+def process_audio():
+    """
+    Transcript request audio file to text using Whisper
+    """
+
+    if model is None:
+        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
+        return WhisperModel(model_size, device="cuda", compute_type="float16")
+
+    try:
+        file = request.files.get('AudioFile')
+        language = request.form.get('language', default=None)
+        file.save(RECORDING_FILE_PATH)
+        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5)
+        transcript=""
+        for segment in segments:
+            transcript=transcript+" "+segment.text
+        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
+
+        return jsonify({"transcript": transcript})
+
+    except Exception as e: # No exception observed during test but we never know
+        print(e)
+        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

From 590320728152f5c0228a1667dd1e8c290148f8de Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:11:09 +0800
Subject: [PATCH 02/15] Delete whisper_module.py

---
 whisper_module.py | 56 -----------------------------------------------
 1 file changed, 56 deletions(-)
 delete mode 100644 whisper_module.py

diff --git a/whisper_module.py b/whisper_module.py
deleted file mode 100644
index 551ac89..0000000
--- a/whisper_module.py
+++ /dev/null
@@ -1,56 +0,0 @@
-"""
-Speech-to-text module based on Whisper for SillyTavern Extras
-    - Whisper github: https://github.com/openai/whisper
-
-Authors:
-    - Tony Ribeiro (https://github.com/Tony-sama)
-
-Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
-
-References:
-    - Code adapted from:
-        - whisper github: https://github.com/openai/whisper
-        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
-"""
-from flask import jsonify, abort, request
-
-from faster_whisper import WhisperModel
-
-DEBUG_PREFIX = "<stt whisper module>"
-RECORDING_FILE_PATH = "stt_test.wav"
-
-model_size = "large-v3-turbo"
-
-model = WhisperModel(model_size, device="cuda", compute_type="float16")
-
-def load_model(file_path=None):
-    """
-    Load given vosk model from file or default to en-us model.
-    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
-    """
-    return WhisperModel(model_size, device="cuda", compute_type="float16")
-
-def process_audio():
-    """
-    Transcript request audio file to text using Whisper
-    """
-
-    if model is None:
-        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
-        return WhisperModel(model_size, device="cuda", compute_type="float16")
-
-    try:
-        file = request.files.get('AudioFile')
-        language = request.form.get('language', default=None)
-        file.save(RECORDING_FILE_PATH)
-        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5)
-        transcript=""
-        for segment in segments:
-            transcript=transcript+" "+segment.text
-        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
-
-        return jsonify({"transcript": transcript})
-
-    except Exception as e: # No exception observed during test but we never know
-        print(e)
-        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

From 24d025af0b2ed528e8e418b91dc9bfaf19617c46 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:11:29 +0800
Subject: [PATCH 03/15] Add files via upload

---
 modules/whisper_module.py | 56 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 modules/whisper_module.py

diff --git a/modules/whisper_module.py b/modules/whisper_module.py
new file mode 100644
index 0000000..551ac89
--- /dev/null
+++ b/modules/whisper_module.py
@@ -0,0 +1,56 @@
+"""
+Speech-to-text module based on Whisper for SillyTavern Extras
+    - Whisper github: https://github.com/openai/whisper
+
+Authors:
+    - Tony Ribeiro (https://github.com/Tony-sama)
+
+Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
+
+References:
+    - Code adapted from:
+        - whisper github: https://github.com/openai/whisper
+        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
+"""
+from flask import jsonify, abort, request
+
+from faster_whisper import WhisperModel
+
+DEBUG_PREFIX = "<stt whisper module>"
+RECORDING_FILE_PATH = "stt_test.wav"
+
+model_size = "large-v3-turbo"
+
+model = WhisperModel(model_size, device="cuda", compute_type="float16")
+
+def load_model(file_path=None):
+    """
+    Load given vosk model from file or default to en-us model.
+    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
+    """
+    return WhisperModel(model_size, device="cuda", compute_type="float16")
+
+def process_audio():
+    """
+    Transcript request audio file to text using Whisper
+    """
+
+    if model is None:
+        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
+        return WhisperModel(model_size, device="cuda", compute_type="float16")
+
+    try:
+        file = request.files.get('AudioFile')
+        language = request.form.get('language', default=None)
+        file.save(RECORDING_FILE_PATH)
+        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5)
+        transcript=""
+        for segment in segments:
+            transcript=transcript+" "+segment.text
+        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
+
+        return jsonify({"transcript": transcript})
+
+    except Exception as e: # No exception observed during test but we never know
+        print(e)
+        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

From ea996b19571705fbd3db42852980e8b57a834eea Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:11:52 +0800
Subject: [PATCH 04/15] Delete modules/whisper_module.py

---
 modules/whisper_module.py | 56 ---------------------------------------
 1 file changed, 56 deletions(-)
 delete mode 100644 modules/whisper_module.py

diff --git a/modules/whisper_module.py b/modules/whisper_module.py
deleted file mode 100644
index 551ac89..0000000
--- a/modules/whisper_module.py
+++ /dev/null
@@ -1,56 +0,0 @@
-"""
-Speech-to-text module based on Whisper for SillyTavern Extras
-    - Whisper github: https://github.com/openai/whisper
-
-Authors:
-    - Tony Ribeiro (https://github.com/Tony-sama)
-
-Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
-
-References:
-    - Code adapted from:
-        - whisper github: https://github.com/openai/whisper
-        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
-"""
-from flask import jsonify, abort, request
-
-from faster_whisper import WhisperModel
-
-DEBUG_PREFIX = "<stt whisper module>"
-RECORDING_FILE_PATH = "stt_test.wav"
-
-model_size = "large-v3-turbo"
-
-model = WhisperModel(model_size, device="cuda", compute_type="float16")
-
-def load_model(file_path=None):
-    """
-    Load given vosk model from file or default to en-us model.
-    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
-    """
-    return WhisperModel(model_size, device="cuda", compute_type="float16")
-
-def process_audio():
-    """
-    Transcript request audio file to text using Whisper
-    """
-
-    if model is None:
-        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
-        return WhisperModel(model_size, device="cuda", compute_type="float16")
-
-    try:
-        file = request.files.get('AudioFile')
-        language = request.form.get('language', default=None)
-        file.save(RECORDING_FILE_PATH)
-        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5)
-        transcript=""
-        for segment in segments:
-            transcript=transcript+" "+segment.text
-        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
-
-        return jsonify({"transcript": transcript})
-
-    except Exception as e: # No exception observed during test but we never know
-        print(e)
-        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

From 2fbbe98eda5dc4a92a8a5a4dbdb8a5578e106eeb Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:12:04 +0800
Subject: [PATCH 05/15] Add files via upload

---
 modules/speech_recognition/whisper_module.py | 113 +++++++++----------
 1 file changed, 56 insertions(+), 57 deletions(-)

diff --git a/modules/speech_recognition/whisper_module.py b/modules/speech_recognition/whisper_module.py
index 056b849..551ac89 100644
--- a/modules/speech_recognition/whisper_module.py
+++ b/modules/speech_recognition/whisper_module.py
@@ -1,57 +1,56 @@
-"""
-Speech-to-text module based on Whisper for SillyTavern Extras
-    - Whisper github: https://github.com/openai/whisper
-
-Authors:
-    - Tony Ribeiro (https://github.com/Tony-sama)
-
-Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
-
-References:
-    - Code adapted from:
-        - whisper github: https://github.com/openai/whisper
-        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
-"""
-from flask import jsonify, abort, request
-
-import whisper
-
-DEBUG_PREFIX = "<stt whisper module>"
-RECORDING_FILE_PATH = "stt_test.wav"
-
-model = None
-
-def load_model(file_path=None):
-    """
-    Load given vosk model from file or default to en-us model.
-    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
-    """
-
-    if file_path is None:
-        return whisper.load_model("base.en")
-    else:
-        return whisper.load_model(file_path)
-
-def process_audio():
-    """
-    Transcript request audio file to text using Whisper
-    """
-
-    if model is None:
-        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
-        return ""
-
-    try:
-        file = request.files.get('AudioFile')
-        language = request.form.get('language', default=None)
-        file.save(RECORDING_FILE_PATH)
-
-        result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language)
-        transcript = result["text"]
-        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
-
-        return jsonify({"transcript": transcript})
-
-    except Exception as e: # No exception observed during test but we never know
-        print(e)
-        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")
+"""
+Speech-to-text module based on Whisper for SillyTavern Extras
+    - Whisper github: https://github.com/openai/whisper
+
+Authors:
+    - Tony Ribeiro (https://github.com/Tony-sama)
+
+Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
+
+References:
+    - Code adapted from:
+        - whisper github: https://github.com/openai/whisper
+        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
+"""
+from flask import jsonify, abort, request
+
+from faster_whisper import WhisperModel
+
+DEBUG_PREFIX = "<stt whisper module>"
+RECORDING_FILE_PATH = "stt_test.wav"
+
+model_size = "large-v3-turbo"
+
+model = WhisperModel(model_size, device="cuda", compute_type="float16")
+
+def load_model(file_path=None):
+    """
+    Load given vosk model from file or default to en-us model.
+    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
+    """
+    return WhisperModel(model_size, device="cuda", compute_type="float16")
+
+def process_audio():
+    """
+    Transcript request audio file to text using Whisper
+    """
+
+    if model is None:
+        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
+        return WhisperModel(model_size, device="cuda", compute_type="float16")
+
+    try:
+        file = request.files.get('AudioFile')
+        language = request.form.get('language', default=None)
+        file.save(RECORDING_FILE_PATH)
+        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5)
+        transcript=""
+        for segment in segments:
+            transcript=transcript+" "+segment.text
+        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
+
+        return jsonify({"transcript": transcript})
+
+    except Exception as e: # No exception observed during test but we never know
+        print(e)
+        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

From 6b3c1a3dd24d439bbd8d581520773f655aa571a3 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com>
Date: Mon, 9 Dec 2024 17:16:35 +0800
Subject: [PATCH 06/15] Update fasterWhisperRequirements.txt

---
 fasterWhisperRequirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fasterWhisperRequirements.txt b/fasterWhisperRequirements.txt
index a2cb6c0..d0aa851 100644
--- a/fasterWhisperRequirements.txt
+++ b/fasterWhisperRequirements.txt
@@ -3,4 +3,5 @@ huggingface_hub>=0.13
 tokenizers>=0.13,<1
 onnxruntime>=1.14,<2 
 av>=11
-tqdm
\ No newline at end of file
+tqdm
+faster-whisper

From 23e8a2cf43da6b099deb2a4d5c85a9a8303febf7 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <arrangingfear56@gmail.com>
Date: Mon, 9 Dec 2024 20:30:01 +0800
Subject: [PATCH 07/15] Revert "Add files via upload"

This reverts commit 2fbbe98eda5dc4a92a8a5a4dbdb8a5578e106eeb.
---
 modules/speech_recognition/whisper_module.py | 113 ++++++++++---------
 1 file changed, 57 insertions(+), 56 deletions(-)

diff --git a/modules/speech_recognition/whisper_module.py b/modules/speech_recognition/whisper_module.py
index 551ac89..056b849 100644
--- a/modules/speech_recognition/whisper_module.py
+++ b/modules/speech_recognition/whisper_module.py
@@ -1,56 +1,57 @@
-"""
-Speech-to-text module based on Whisper for SillyTavern Extras
-    - Whisper github: https://github.com/openai/whisper
-
-Authors:
-    - Tony Ribeiro (https://github.com/Tony-sama)
-
-Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
-
-References:
-    - Code adapted from:
-        - whisper github: https://github.com/openai/whisper
-        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
-"""
-from flask import jsonify, abort, request
-
-from faster_whisper import WhisperModel
-
-DEBUG_PREFIX = "<stt whisper module>"
-RECORDING_FILE_PATH = "stt_test.wav"
-
-model_size = "large-v3-turbo"
-
-model = WhisperModel(model_size, device="cuda", compute_type="float16")
-
-def load_model(file_path=None):
-    """
-    Load given vosk model from file or default to en-us model.
-    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
-    """
-    return WhisperModel(model_size, device="cuda", compute_type="float16")
-
-def process_audio():
-    """
-    Transcript request audio file to text using Whisper
-    """
-
-    if model is None:
-        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
-        return WhisperModel(model_size, device="cuda", compute_type="float16")
-
-    try:
-        file = request.files.get('AudioFile')
-        language = request.form.get('language', default=None)
-        file.save(RECORDING_FILE_PATH)
-        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5)
-        transcript=""
-        for segment in segments:
-            transcript=transcript+" "+segment.text
-        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
-
-        return jsonify({"transcript": transcript})
-
-    except Exception as e: # No exception observed during test but we never know
-        print(e)
-        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")
+"""
+Speech-to-text module based on Whisper for SillyTavern Extras
+    - Whisper github: https://github.com/openai/whisper
+
+Authors:
+    - Tony Ribeiro (https://github.com/Tony-sama)
+
+Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper
+
+References:
+    - Code adapted from:
+        - whisper github: https://github.com/openai/whisper
+        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
+"""
+from flask import jsonify, abort, request
+
+import whisper
+
+DEBUG_PREFIX = "<stt whisper module>"
+RECORDING_FILE_PATH = "stt_test.wav"
+
+model = None
+
+def load_model(file_path=None):
+    """
+    Load given vosk model from file or default to en-us model.
+    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
+    """
+
+    if file_path is None:
+        return whisper.load_model("base.en")
+    else:
+        return whisper.load_model(file_path)
+
+def process_audio():
+    """
+    Transcript request audio file to text using Whisper
+    """
+
+    if model is None:
+        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
+        return ""
+
+    try:
+        file = request.files.get('AudioFile')
+        language = request.form.get('language', default=None)
+        file.save(RECORDING_FILE_PATH)
+
+        result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language)
+        transcript = result["text"]
+        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
+
+        return jsonify({"transcript": transcript})
+
+    except Exception as e: # No exception observed during test but we never know
+        print(e)
+        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")

From f3cb6c330b6c1bcabd55fca8b56f9f2dbdf97e56 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <arrangingfear56@gmail.com>
Date: Mon, 9 Dec 2024 20:37:48 +0800
Subject: [PATCH 08/15] Update whisper_module.py

---
 modules/speech_recognition/whisper_module.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/modules/speech_recognition/whisper_module.py b/modules/speech_recognition/whisper_module.py
index 056b849..6b699f9 100644
--- a/modules/speech_recognition/whisper_module.py
+++ b/modules/speech_recognition/whisper_module.py
@@ -13,24 +13,24 @@ References:
         - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
 """
 from flask import jsonify, abort, request
-
-import whisper
+from faster_whisper import WhisperModel
 
 DEBUG_PREFIX = "<stt whisper module>"
 RECORDING_FILE_PATH = "stt_test.wav"
 
-model = None
+model_size = "large-v3-turbo"
+
+model = WhisperModel(model_size, device="cuda", compute_type="float16")
 
 def load_model(file_path=None):
     """
     Load given vosk model from file or default to en-us model.
     Download model to user cache folder, example: C:/Users/toto/.cache/vosk
     """
-
     if file_path is None:
-        return whisper.load_model("base.en")
+        return WhisperModel(model_size, device="cuda", compute_type="float16")
     else:
-        return whisper.load_model(file_path)
+        return WhisperModel(file_path, device="cuda", compute_type="float16")
 
 def process_audio():
     """
@@ -45,9 +45,10 @@ def process_audio():
         file = request.files.get('AudioFile')
         language = request.form.get('language', default=None)
         file.save(RECORDING_FILE_PATH)
-
-        result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language)
-        transcript = result["text"]
+        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5)
+        transcript=""
+        for segment in segments:
+            transcript=transcript+" "+segment.text
         print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
 
         return jsonify({"transcript": transcript})

From 2ef37240b90d0abecc81f4d2b575843f0140626a Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <arrangingfear56@gmail.com>
Date: Tue, 10 Dec 2024 00:08:53 +0800
Subject: [PATCH 09/15] Revert "Update whisper_module.py"

This reverts commit f3cb6c330b6c1bcabd55fca8b56f9f2dbdf97e56.
---
 modules/speech_recognition/whisper_module.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/modules/speech_recognition/whisper_module.py b/modules/speech_recognition/whisper_module.py
index 6b699f9..056b849 100644
--- a/modules/speech_recognition/whisper_module.py
+++ b/modules/speech_recognition/whisper_module.py
@@ -13,24 +13,24 @@ References:
         - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
 """
 from flask import jsonify, abort, request
-from faster_whisper import WhisperModel
+
+import whisper
 
 DEBUG_PREFIX = "<stt whisper module>"
 RECORDING_FILE_PATH = "stt_test.wav"
 
-model_size = "large-v3-turbo"
-
-model = WhisperModel(model_size, device="cuda", compute_type="float16")
+model = None
 
 def load_model(file_path=None):
     """
     Load given vosk model from file or default to en-us model.
     Download model to user cache folder, example: C:/Users/toto/.cache/vosk
     """
+
     if file_path is None:
-        return WhisperModel(model_size, device="cuda", compute_type="float16")
+        return whisper.load_model("base.en")
     else:
-        return WhisperModel(file_path, device="cuda", compute_type="float16")
+        return whisper.load_model(file_path)
 
 def process_audio():
     """
@@ -45,10 +45,9 @@ def process_audio():
         file = request.files.get('AudioFile')
         language = request.form.get('language', default=None)
         file.save(RECORDING_FILE_PATH)
-        segments, info = model.transcribe(RECORDING_FILE_PATH, beam_size=5)
-        transcript=""
-        for segment in segments:
-            transcript=transcript+" "+segment.text
+
+        result = model.transcribe(RECORDING_FILE_PATH, condition_on_previous_text=False, language=language)
+        transcript = result["text"]
         print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
 
         return jsonify({"transcript": transcript})

From dbec27b18a5613a9e1c46bf00a9e6efc9b12f7b7 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <arrangingfear56@gmail.com>
Date: Tue, 10 Dec 2024 00:55:46 +0800
Subject: [PATCH 10/15] added arguments for faster-whisper usage without
 breaking original whisper implementation

---
 .../faster_whisper_module.py                  | 60 +++++++++++++++++++
 server.py                                     | 34 +++++++++--
 2 files changed, 90 insertions(+), 4 deletions(-)
 create mode 100644 modules/speech_recognition/faster_whisper_module.py

diff --git a/modules/speech_recognition/faster_whisper_module.py b/modules/speech_recognition/faster_whisper_module.py
new file mode 100644
index 0000000..50f4c0b
--- /dev/null
+++ b/modules/speech_recognition/faster_whisper_module.py
@@ -0,0 +1,60 @@
+"""
+Speech-to-text module based on faster-whisper for SillyTavern Extras
+    - faster-whisper github: https://github.com/SYSTRAN/faster-whisper
+
+Authors:
+    - Tony Ribeiro (https://github.com/Tony-sama)
+
+Models are downloaded into the Hugging Face cache folder, example: C:/Users/toto/.cache/huggingface/hub
+
+References:
+    - Code adapted from:
+        - whisper github: https://github.com/openai/whisper
+        - oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui
+"""
+from flask import jsonify, abort, request
+
+from faster_whisper import WhisperModel
+
+DEBUG_PREFIX = "<stt whisper module>"
+RECORDING_FILE_PATH = "stt_test.wav"
+
+model_size = "large-v3-turbo"
+
+def load_model(file_path=None,whisper_device="cuda",whisper_compute_type='float16'):
+    """
+    Load given vosk model from file or default to en-us model.
+    Download model to user cache folder, example: C:/Users/toto/.cache/vosk
+    """
+
+    if file_path is None:
+        print(f"faster-whisper using {model_size}")
+        return WhisperModel(model_size, device=whisper_device, compute_type=whisper_compute_type)
+    else:
+        print(f"faster-whisper using {file_path}")
+        return WhisperModel(file_path, device=whisper_device, compute_type=whisper_compute_type)
+
+def process_audio():
+    """
+    Transcribe the uploaded request audio file ('AudioFile') to text using faster-whisper.
+    """
+    # server.py assigns whisper_module.model from load_model() before registering this view.
+    if model is None:
+        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
+        return ""
+
+    try:
+        file = request.files.get('AudioFile')
+        language = request.form.get('language', default=None)
+        file.save(RECORDING_FILE_PATH)
+        # Forward the requested language; a missing/empty value falls back to auto-detection.
+        segments, _ = model.transcribe(RECORDING_FILE_PATH, beam_size=5, language=language or None)
+        # Every segment text is prefixed with a single space, matching the previous output.
+        transcript = "".join(" " + segment.text for segment in segments)
+        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)
+
+        return jsonify({"transcript": transcript})
+
+    except Exception as e: # No exception observed during test but we never know
+        print(e)
+        abort(500, DEBUG_PREFIX+" Exception occurs while processing audio")
diff --git a/server.py b/server.py
index 48b7d36..6a3548e 100644
--- a/server.py
+++ b/server.py
@@ -935,7 +935,12 @@ parser.add_argument("--max-content-length", help="Set the max")
 parser.add_argument("--rvc-save-file", action="store_true", help="Save the last rvc input/output audio file into data/tmp/ folder (for research)")
 
 parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-text model")
-parser.add_argument("--stt-whisper-model-path", help="Load a custom vosk speech-to-text model")
+parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model")
+
+parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper")
+parser.add_argument("--faster-whisper-device", help="Choose between cpu and cuda to run faster-whisper, defaults to cuda")
+parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16")
+
 # sd_group = parser.add_mutually_exclusive_group()
 
 local_sd = parser.add_argument_group("sd-local")
@@ -1161,15 +1166,36 @@ if "vosk-stt" in modules:
     app.add_url_rule("/api/speech-recognition/vosk/process-audio", view_func=vosk_module.process_audio, methods=["POST"])
 
 if "whisper-stt" in modules:
-    print("Initializing Whisper speech-recognition (from ST request file)")
+    whisper_fast=(
+        True
+        if args.use_faster_whisper
+        else False)
+
     whisper_model_path = (
         args.stt_whisper_model_path
         if args.stt_whisper_model_path
         else None)
 
-    import modules.speech_recognition.whisper_module as whisper_module
+    if whisper_fast:
+
+        faster_whisper_device=(
+            args.faster_whisper_device
+            if args.faster_whisper_device
+            else "cuda")
+
+        faster_whisper_type=(
+            args.faster_whisper_type
+            if args.faster_whisper_type
+            else "float16")
+
+        print(f"Initializing Faster-Whisper speech-recognition (from ST request file) on {faster_whisper_device}")
+        import modules.speech_recognition.faster_whisper_module as whisper_module
+        whisper_module.model = whisper_module.load_model(file_path=whisper_model_path,whisper_device=faster_whisper_device,whisper_compute_type=faster_whisper_type)
+    else:
+        print("Initializing Whisper speech-recognition (from ST request file)")
+        import modules.speech_recognition.whisper_module as whisper_module
+        whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
 
-    whisper_module.model = whisper_module.load_model(file_path=whisper_model_path)
     app.add_url_rule("/api/speech-recognition/whisper/process-audio", view_func=whisper_module.process_audio, methods=["POST"])
 
 if "streaming-stt" in modules:

From d86a8622c33ff67330d029b92c0353c82a9e1229 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <arrangingfear56@gmail.com>
Date: Tue, 10 Dec 2024 01:03:49 +0800
Subject: [PATCH 11/15] modified server.py for ease of choosing
 faster-whisper-device

---
 .gitignore                                          | 1 +
 modules/speech_recognition/faster_whisper_module.py | 1 +
 server.py                                           | 6 +++---
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index e385523..61324d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,4 @@ api_key.txt
 stt_test.wav
 talkinghead/tha3/models
 docker/cache
+launch.bat
diff --git a/modules/speech_recognition/faster_whisper_module.py b/modules/speech_recognition/faster_whisper_module.py
index 50f4c0b..aedb95a 100644
--- a/modules/speech_recognition/faster_whisper_module.py
+++ b/modules/speech_recognition/faster_whisper_module.py
@@ -21,6 +21,7 @@ RECORDING_FILE_PATH = "stt_test.wav"
 
 model_size = "large-v3-turbo"
 
+
 def load_model(file_path=None,whisper_device="cuda",whisper_compute_type='float16'):
     """
     Load given vosk model from file or default to en-us model.
diff --git a/server.py b/server.py
index 6a3548e..38054e0 100644
--- a/server.py
+++ b/server.py
@@ -938,7 +938,7 @@ parser.add_argument("--stt-vosk-model-path", help="Load a custom vosk speech-to-
 parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper speech-to-text model")
 
 parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper")
-parser.add_argument("--faster-whisper-device", help="Choose between cpu and cuda to run faster-whisper, defaults to cuda")
+parser.add_argument("--faster-whisper-cpu", action="store_true", help="Use cpu to run faster-whisper, saves VRAM but much slower")
 parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16")
 
 # sd_group = parser.add_mutually_exclusive_group()
@@ -1179,8 +1179,8 @@ if "whisper-stt" in modules:
     if whisper_fast:
 
         faster_whisper_device=(
-            args.faster_whisper_device
-            if args.faster_whisper_device
+            "cpu"
+            if args.faster_whisper_cpu
             else "cuda")
 
         faster_whisper_type=(

From 6435ea40f180af9e731d0cc5b6608ee45a026eb0 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <arrangingfear56@gmail.com>
Date: Tue, 10 Dec 2024 01:23:13 +0800
Subject: [PATCH 12/15] change default computing types for different devices

---
 modules/speech_recognition/faster_whisper_module.py | 11 ++++++++---
 server.py                                           |  4 ++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/modules/speech_recognition/faster_whisper_module.py b/modules/speech_recognition/faster_whisper_module.py
index aedb95a..c571257 100644
--- a/modules/speech_recognition/faster_whisper_module.py
+++ b/modules/speech_recognition/faster_whisper_module.py
@@ -22,17 +22,22 @@ RECORDING_FILE_PATH = "stt_test.wav"
 model_size = "large-v3-turbo"
 
 
-def load_model(file_path=None,whisper_device="cuda",whisper_compute_type='float16'):
+def load_model(file_path=None,whisper_device="cuda",whisper_compute_type="auto"):
     """
     Load given vosk model from file or default to en-us model.
     Download model to user cache folder, example: C:/Users/toto/.cache/vosk
     """
+    if whisper_compute_type=="auto":
+        whisper_compute_type=(
+            "int8"
+            if whisper_device=="cpu"
+            else "float16")
 
     if file_path is None:
-        print(f"faster-whisper using {model_size}")
+        print(f"faster-whisper using {model_size} model with {whisper_compute_type}")
         return WhisperModel(model_size, device=whisper_device, compute_type=whisper_compute_type)
     else:
-        print(f"faster-whisper using {file_path}")
+        print(f"faster-whisper using {file_path} model with {whisper_compute_type}")
         return WhisperModel(file_path, device=whisper_device, compute_type=whisper_compute_type)
 
 def process_audio():
diff --git a/server.py b/server.py
index 38054e0..e821bd6 100644
--- a/server.py
+++ b/server.py
@@ -939,7 +939,7 @@ parser.add_argument("--stt-whisper-model-path", help="Load a custom whisper spee
 
 parser.add_argument("--use-faster-whisper", action="store_true", help="Choose to use faster-whisper instead of whisper")
 parser.add_argument("--faster-whisper-cpu", action="store_true", help="Use cpu to run faster-whisper, saves VRAM but much slower")
-parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16")
+parser.add_argument("--faster-whisper-type", help="Choose faster-whisper compute type, defaults to float16 for cuda and int8 for cpu")
 
 # sd_group = parser.add_mutually_exclusive_group()
 
@@ -1186,7 +1186,7 @@ if "whisper-stt" in modules:
         faster_whisper_type=(
             args.faster_whisper_type
             if args.faster_whisper_type
-            else "float16")
+            else "auto")
 
         print(f"Initializing Faster-Whisper speech-recognition (from ST request file) on {faster_whisper_device}")
         import modules.speech_recognition.faster_whisper_module as whisper_module

From 97251b1a0cc89a99f3d0c458d14d69dfe3a1e19f Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com>
Date: Tue, 10 Dec 2024 01:23:34 +0800
Subject: [PATCH 13/15] Update README.md

---
 README.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/README.md b/README.md
index 3767cea..882963e 100644
--- a/README.md
+++ b/README.md
@@ -230,6 +230,19 @@ cd SillyTavern-extras
 | `--sd-remote-ssl`        | Use SSL for the remote SD backend<br>Default: **False** |
 | `--sd-remote-auth`       | Specify the `username:password` for the remote SD backend (if required) |
 
+## Using faster-whisper instead of whisper with Whisper (Extras)
+1. Install CUDA 12 and cuDNN 8
+2. Install faster-whisper requirements
+```
+pip install -r fasterWhisperRequirements.txt
+```
+use the `--use-faster-whisper` argument to switch to faster-whisper
+
+Optional:
+
+1. Use the `--faster-whisper-type` argument to change the compute type for faster-whisper (e.g. `--faster-whisper-type=int8`)
+2. Use the `--faster-whisper-cpu` argument to switch the computing device to CPU
+
 ## Coqui TTS
 
 ### Running on Mac M1

From fe91e0742f6204ea520cc23dae2a591ce0156e35 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <arrangingfear56@gmail.com>
Date: Wed, 11 Dec 2024 00:19:18 +0800
Subject: [PATCH 14/15] changed name of requirements of faster whisper to fit
 in line with the rest of the requirements

---
 ...irements.txt => requirements-faster-whisper.txt | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
 rename fasterWhisperRequirements.txt => requirements-faster-whisper.txt (94%)

diff --git a/fasterWhisperRequirements.txt b/requirements-faster-whisper.txt
similarity index 94%
rename from fasterWhisperRequirements.txt
rename to requirements-faster-whisper.txt
index d0aa851..76a9c65 100644
--- a/fasterWhisperRequirements.txt
+++ b/requirements-faster-whisper.txt
@@ -1,7 +1,7 @@
-ctranslate2==4.4.0
-huggingface_hub>=0.13
-tokenizers>=0.13,<1
-onnxruntime>=1.14,<2 
-av>=11
-tqdm
-faster-whisper
+ctranslate2==4.4.0
+huggingface_hub>=0.13
+tokenizers>=0.13,<1
+onnxruntime>=1.14,<2 
+av>=11
+tqdm
+faster-whisper

From 028b798073f45601712c242adf79ce89a4da8952 Mon Sep 17 00:00:00 2001
From: ArrangingFear56 <82667894+ArrangingFear56@users.noreply.github.com>
Date: Wed, 11 Dec 2024 00:21:29 +0800
Subject: [PATCH 15/15] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 882963e..c1dd9b4 100644
--- a/README.md
+++ b/README.md
@@ -234,7 +234,7 @@ cd SillyTavern-extras
 1. Install CUDA 12 and cuDNN 8
 2. Install faster-whisper requirements
 ```
-pip install -r fasterWhisperRequirements.txt
+pip install -r requirements-faster-whisper.txt
 ```
 use the `--use-faster-whisper` argument to switch to faster-whisper