From 4c4a10d439afc72f29797974b836f91964a5cdb5 Mon Sep 17 00:00:00 2001
From: Jaret Burkett <jaretburkett@gmail.com>
Date: Wed, 6 Aug 2025 11:40:02 -0600
Subject: [PATCH] Remove vision model from qwen text encoder as it is not
 needed for image generation currently

---
 extensions_built_in/diffusion_models/qwen_image/qwen_image.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/extensions_built_in/diffusion_models/qwen_image/qwen_image.py b/extensions_built_in/diffusion_models/qwen_image/qwen_image.py
index f7607ccb..bcd42ed3 100644
--- a/extensions_built_in/diffusion_models/qwen_image/qwen_image.py
+++ b/extensions_built_in/diffusion_models/qwen_image/qwen_image.py
@@ -130,6 +130,10 @@ class QwenImageModel(BaseModel):
         text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
             base_model_path, subfolder="text_encoder", torch_dtype=dtype
         )
+        
+        # drop the visual model before the device transfer below; it is not needed for image generation currently
+        text_encoder.model.visual = None
+        
         text_encoder.to(self.device_torch, dtype=dtype)
         flush()