Add Qwen2.5 mode to grounding demo

2026-04-20 14:29:28 +00:00 · 2025-01-29 22:39:38 +01:00
parent cce6f95cd3
commit 96b2f9df77
1 changed files with 73 additions and 7 deletions
--- a/examples/multimodal_grounding_qwen.py
+++ b/examples/multimodal_grounding_qwen.py
@@ -51,7 +51,7 @@ class Model:
    current_image: Image or None = None
    current_description: str

-    def __init__(self, model_directory):
+    def __init__(self, model_directory, bbox_mode: str):
        self.model_directory = model_directory
        self.config = None
        self.vision_model = None
@@ -61,17 +61,22 @@ class Model:
        self.current_image = None
        self.current_emb = None
        self.current_description = ""
+        bbox_funcs = {
+            "qwen2": self.get_grounding_bb_qwen2,
+            "qwen25": self.get_grounding_bb_qwen25,
+        }
+        self.bbox_func = bbox_funcs[bbox_mode]

    def load(self):
        """Load and initialize the things"""
        self.config = ExLlamaV2Config(self.model_directory)
-        self.config.max_seq_len = 16384
+        self.config.max_seq_len = 8192

        self.vision_model = ExLlamaV2VisionTower(self.config)
        self.vision_model.load(progress = True)

        self.model = ExLlamaV2(self.config)
-        self.cache = ExLlamaV2Cache(self.model, lazy = True, max_seq_len = 16384)
+        self.cache = ExLlamaV2Cache(self.model, lazy = True, max_seq_len = 32768)
        self.model.load_autosplit(self.cache, progress = True)
        self.tokenizer = ExLlamaV2Tokenizer(self.config)

@@ -148,6 +153,13 @@ class Model:
                lastupdate = time.time()
                settext_fn(text)
                update_fn()
+#
+#         text = \
+# """And you may find yourself living in a shotgun shack
+# And you may find yourself in another part of the world
+# And you may find yourself behind the wheel of a large automobile
+# And you may find yourself in a beautiful house, with a beautiful wife
+# And you may ask yourself, "Well, how did I get here?\""""

        settext_fn(text)
        update_fn()
@@ -155,7 +167,7 @@ class Model:
        print("Image description from model:")
        print(text)

-    def get_grounding_bb(self, start, end) -> tuple:
+    def get_grounding_bb_qwen2(self, start, end) -> tuple:
        """
        Prompt the model again and try to extraxt the bounding box of the image details indicated by selected portion
        of the description. We do this by repeating the exact same prompt up to and including the selected text, but
@@ -209,6 +221,55 @@ class Model:

        return a, b

+    def get_grounding_bb_qwen25(self, start, end) -> tuple:
+        """
+        Qwen2.5 works the same way, except the coordinates are no longer normalized and the format is:
+        "(x0,y0,x1,y1)"
+        """
+
+        if start >= end:
+            return None, None
+
+        # Including leading space
+        if start > 0 and self.current_description[start - 1] == " ":
+            start -= 1
+
+        # Repeat the same prompt up to the selection, with grounding tokens added
+        prompt = self.get_prompt()
+        prompt += self.current_description[:start]
+        prompt += "<|object_ref_start|>"
+        prompt += self.current_description[start:end]
+        prompt += "<|object_ref_end|><|box_start|>("
+
+        bb_string, res = self.generator.generate(
+            prompt = prompt,
+            add_bos = True,
+            max_new_tokens = 28,
+            stop_conditions = [self.tokenizer.single_id("<|box_end|>")],
+            gen_settings = ExLlamaV2Sampler.Settings.greedy(),
+            embeddings = [self.current_emb],
+            completion_only = True,
+            return_last_results = True,  # debug purposes
+        )
+        bb_string = "(" + bb_string
+
+        print(f"Generation: {bb_string}")
+        pprint.pprint(res, indent = 4)
+
+        # BB string is in the format "(x0,y0,x1,y1)" with integer coordinates
+
+        s = self.current_image.size
+        try:
+            d = tuple(map(int, bb_string.strip("()").split(",")))
+            a = (d[0] / s[0], d[1] / s[1])
+            b = (d[2] / s[0], d[3] / s[1])
+        except:
+            print("No bounding box could be determined")
+            a, b = None, None
+
+        return a, b
+
+

 class GroundingDemo(QMainWindow):

@@ -472,7 +533,7 @@ class GroundingDemo(QMainWindow):

        print(f"Selected span: {start}, {end}")
        print(f"Selected text: {repr(self.model.current_description[start:end])}")
-        a, b = self.model.get_grounding_bb(start, end)
+        a, b = self.model.bbox_func(start, end)
        self.image_label.set_bounding_box(a, b)


@@ -481,9 +542,14 @@ class GroundingDemo(QMainWindow):
 #   https://huggingface.co/turboderp/Qwen2-VL-7B-Instruct-exl2

 def main():
-    model_dir = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
+
+    # model_dir = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
+    # bbox_mode = "qwen25"
+    model_dir = "/mnt/str/models/qwen2.5-vl-7b-instruct-exl2/6.0bpw"
+    bbox_mode = "qwen25"
+
    app = QApplication(sys.argv)
-    model = Model(model_dir)
+    model = Model(model_dir, bbox_mode)
    model.load()
    window = GroundingDemo(model, model_dir)
    window.show()