Update to CLIP Interrogator 0.6.0:

- supports the latest version of the transformers library (no more conflict with the Dreambooth extension)
- adds support for the BLIP-2 and git-large-coco caption models
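For reference, a minimal sketch of the 0.6.0 library surface this commit wires into the extension. Every call below appears in the diff except `Interrogator.interrogate`, and the image path is a placeholder:

```python
from PIL import Image
from clip_interrogator import Config, Interrogator, list_caption_models, list_clip_models

print(list_caption_models())  # e.g. ['blip-base', 'blip-large', 'git-large-coco', 'blip2-2.7b', ...]
print(list_clip_models())     # e.g. ['ViT-L-14/openai', 'ViT-H-14/laion2b_s32b_b79k', ...]

config = Config(
    clip_model_name='ViT-L-14/openai',
    caption_model_name='blip-large',  # new in 0.6.0: the caption model is selectable
)
ci = Interrogator(config)

image = Image.open('example.jpg').convert('RGB')  # placeholder path
print(ci.interrogate(image))
```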
commit 6b950289b8
parent b397678709
Author: pharmapsychotic
Date: 2023-03-19 23:16:30 -05:00

2 changed files with 43 additions and 28 deletions

install.py

@@ -1,6 +1,6 @@
 import launch

-CI_VERSION = "0.5.4"
+CI_VERSION = "0.6.0"

 needs_install = False
 try:

scripts/clip_interrogator_ext.py

@@ -7,13 +7,14 @@ import torch
 from PIL import Image

 import clip_interrogator
-from clip_interrogator import Config, Interrogator
+from clip_interrogator import Config, Interrogator, list_caption_models, list_clip_models

 from modules import devices, lowvram, script_callbacks, shared

-__version__ = '0.0.9'
+__version__ = '0.1.0'

 ci = None
+low_vram = False

 BATCH_OUTPUT_MODES = [
     'Text file for each image',
@@ -49,24 +50,22 @@ class BatchWriter:
         self.file.close()

-def load(clip_model_name):
+def load(clip_model_name, caption_model_name):
     global ci
     if ci is None:
         print(f"Loading CLIP Interrogator {clip_interrogator.__version__}...")
-        low_vram = shared.cmd_opts.lowvram or shared.cmd_opts.medvram
-        if not low_vram and torch.cuda.is_available():
-            device = devices.get_optimal_device()
-            vram_total = torch.cuda.get_device_properties(device).total_memory
-            if vram_total < 12*1024*1024*1024:
-                low_vram = True
-                print(f" detected < 12GB VRAM, using low VRAM mode")
-        config = Config(device=devices.get_optimal_device(), clip_model_name=clip_model_name)
+        config = Config(
+            device=devices.get_optimal_device(),
+            clip_model_name=clip_model_name,
+            caption_model_name=caption_model_name
+        )
         config.cache_path = 'models/clip-interrogator'
         if low_vram:
             config.apply_low_vram_defaults()
         ci = Interrogator(config)
+    if caption_model_name != ci.config.caption_model_name:
+        ci.config.caption_model_name = caption_model_name
+        ci.load_caption_model()
     if clip_model_name != ci.config.clip_model_name:
         ci.config.clip_model_name = clip_model_name
         ci.load_clip_model()
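Note the new branches at the end of this hunk: once `ci` exists, changing a selection reloads only that component rather than rebuilding the whole `Interrogator`. The same pattern can be driven against the library directly (a sketch assuming `ci` is an already-constructed `Interrogator`):

```python
# Swap only the caption model; the loaded CLIP model stays in place.
ci.config.caption_model_name = 'git-large-coco'
ci.load_caption_model()

# Likewise for the CLIP model:
ci.config.clip_model_name = 'ViT-H-14/laion2b_s32b_b79k'
ci.load_clip_model()
```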
@@ -75,15 +74,12 @@ def unload():
     global ci
     if ci is not None:
         print("Offloading CLIP Interrogator...")
-        ci.blip_model = ci.blip_model.to(devices.cpu)
+        ci.caption_model = ci.caption_model.to(devices.cpu)
         ci.clip_model = ci.clip_model.to(devices.cpu)
-        ci.blip_offloaded = True
+        ci.caption_offloaded = True
         ci.clip_offloaded = True
         devices.torch_gc()

-def get_models():
-    return ['/'.join(x) for x in open_clip.list_pretrained()]
-
 def image_analysis(image, clip_model_name):
     load(clip_model_name)
@@ -119,7 +115,7 @@ def interrogate(image, mode, caption=None):
         raise Exception(f"Unknown mode {mode}")
     return prompt

-def image_to_prompt(image, mode, clip_model_name):
+def image_to_prompt(image, mode, clip_model_name, caption_model_name):
     shared.state.begin()
     shared.state.job = 'interrogate'
@@ -128,7 +124,7 @@ def image_to_prompt(image, mode, clip_model_name):
             lowvram.send_everything_to_cpu()
             devices.torch_gc()
-        load(clip_model_name)
+        load(clip_model_name, caption_model_name)
         image = image.convert('RGB')
         prompt = interrogate(image, mode)
     except torch.cuda.OutOfMemoryError as e:
@@ -147,8 +143,15 @@ def about_tab():
     gr.Markdown("*Want to figure out what a good prompt might be to create new images like an existing one? The CLIP Interrogator is here to get you answers!*")
     gr.Markdown("## Notes")
     gr.Markdown(
+        "CLIP models:\n"
         "* For best prompts with Stable Diffusion 1.* choose the **ViT-L-14/openai** model.\n"
         "* For best prompts with Stable Diffusion 2.* choose the **ViT-H-14/laion2b_s32b_b79k** model.\n"
+        "\nCaption models:\n"
+        "* blip-large is recommended. use blip-base if you have less than 8GB of VRAM.\n"
+        "* blip-base: 990MB, blip-large: 1.9GB\n"
+        "* git-large-coco: 1.58GB\n"
+        "* blip2-2.7b: 15.5GB, blip2-flan-t5-xl: 15.77GB\n"
+        "\nOther:\n"
         "* When you are done click the **Unload** button to free up memory."
     )
     gr.Markdown("## Github")
@@ -164,7 +167,7 @@ def analyze_tab():
     with gr.Column():
         with gr.Row():
             image = gr.Image(type='pil', label="Image")
-            model = gr.Dropdown(get_models(), value='ViT-L-14/openai', label='CLIP Model')
+            model = gr.Dropdown(list_clip_models(), value='ViT-L-14/openai', label='CLIP Model')
         with gr.Row():
             medium = gr.Label(label="Medium", num_top_classes=5)
             artist = gr.Label(label="Artist", num_top_classes=5)
@@ -175,7 +178,7 @@ def analyze_tab():
     button.click(image_analysis, inputs=[image, model], outputs=[medium, artist, movement, trending, flavor])

 def batch_tab():
-    def batch_process(folder, model, mode, output_mode):
+    def batch_process(folder, clip_model, caption_model, mode, output_mode):
         if not os.path.exists(folder):
             return f"Folder {folder} does not exist"
         if not os.path.isdir(folder):
@@ -193,7 +196,7 @@ def batch_tab():
             lowvram.send_everything_to_cpu()
             devices.torch_gc()
-        load(model)
+        load(clip_model, caption_model)
         shared.total_tqdm.updateTotal(len(files))
         ci.config.quiet = True
@@ -234,7 +237,8 @@ def batch_tab():
         with gr.Row():
             folder = gr.Text(label="Images folder", value="", interactive=True)
         with gr.Row():
-            model = gr.Dropdown(get_models(), value='ViT-L-14/openai', label='CLIP Model')
+            clip_model = gr.Dropdown(list_clip_models(), value='ViT-L-14/openai', label='CLIP Model')
+            caption_model = gr.Dropdown(list_caption_models(), value='blip-base' if low_vram else 'blip-large', label='Caption Model')
             mode = gr.Radio(['caption', 'best', 'fast', 'classic', 'negative'], label='Prompt Mode', value='fast')
             output_mode = gr.Dropdown(BATCH_OUTPUT_MODES, value=BATCH_OUTPUT_MODES[0], label='Output Mode')
         with gr.Row():
@@ -242,7 +246,7 @@ def batch_tab():
         interrupt = gr.Button('Interrupt', visible=True)
     interrupt.click(fn=lambda: shared.state.interrupt(), inputs=[], outputs=[])
-    button.click(batch_process, inputs=[folder, model, mode, output_mode], outputs=[])
+    button.click(batch_process, inputs=[folder, clip_model, caption_model, mode, output_mode], outputs=[])

 def prompt_tab():
     with gr.Column():
@@ -250,16 +254,27 @@ def prompt_tab():
             image = gr.Image(type='pil', label="Image")
         with gr.Column():
             mode = gr.Radio(['best', 'fast', 'classic', 'negative'], label='Mode', value='best')
-            model = gr.Dropdown(get_models(), value='ViT-L-14/openai', label='CLIP Model')
+            clip_model = gr.Dropdown(list_clip_models(), value='ViT-L-14/openai', label='CLIP Model')
+            caption_model = gr.Dropdown(list_caption_models(), value='blip-base' if low_vram else 'blip-large', label='Caption Model')
             prompt = gr.Textbox(label="Prompt", lines=3)
     with gr.Row():
         button = gr.Button("Generate", variant='primary')
         unload_button = gr.Button("Unload")
-    button.click(image_to_prompt, inputs=[image, mode, model], outputs=prompt)
+    button.click(image_to_prompt, inputs=[image, mode, clip_model, caption_model], outputs=prompt)
     unload_button.click(unload)

 def add_tab():
+    global low_vram
+    low_vram = shared.cmd_opts.lowvram or shared.cmd_opts.medvram
+    if not low_vram and torch.cuda.is_available():
+        device = devices.get_optimal_device()
+        vram_total = torch.cuda.get_device_properties(device).total_memory
+        if vram_total < 12*1024*1024*1024:
+            low_vram = True
+            print(f" detected < 12GB VRAM, using low VRAM mode")
     with gr.Blocks(analytics_enabled=False) as ui:
         with gr.Tab("Prompt"):
             prompt_tab()
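The final hunk moves the VRAM probe from load() into add_tab(), so low_vram is already known when the UI builds and can drive the 'blip-base' vs 'blip-large' dropdown defaults above. A standalone sketch of the same heuristic, assuming CUDA device 0:

```python
import torch

# 12GB threshold in bytes, matching the check in add_tab().
low_vram = False
if torch.cuda.is_available():
    vram_total = torch.cuda.get_device_properties(0).total_memory
    low_vram = vram_total < 12 * 1024 * 1024 * 1024
print('low VRAM mode:', low_vram)
```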