Mirror of https://github.com/theroyallab/tabbyAPI.git (synced 2026-04-28 10:11:39 +00:00)
Merge pull request #244 from DocShotgun/draft-flash-attn-fix
Fix draft model non-FA2 fallback
@@ -159,7 +159,6 @@ class ExllamaV2Container:
 
         if enable_draft:
             self.draft_config = ExLlamaV2Config()
-            self.draft_config.no_flash_attn = self.config.no_flash_attn
             draft_model_path = pathlib.Path(
                 unwrap(draft_args.get("draft_model_dir"), "models")
             )
@@ -253,6 +252,8 @@ class ExllamaV2Container:
             or not supports_paged_attn()
         ):
             self.config.no_flash_attn = True
+            if self.draft_config:
+                self.draft_config.no_flash_attn = True
             self.paged = False
             self.max_batch_size = 1
             torch.backends.cuda.enable_flash_sdp(False)
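In short: instead of copying no_flash_attn onto the draft config when it is created (a point where the main config's value may not yet be final), the flag is now set on the draft config inside the non-FA2 fallback branch itself, so a loaded draft model always follows the main model's fallback. Below is a minimal sketch of the ordering problem, using hypothetical stub objects for illustration only, not the real ExLlamaV2Config instances:

# Sketch of the ordering issue fixed by this patch (stub classes, not tabbyAPI code).
from dataclasses import dataclass
from typing import Optional

@dataclass
class StubConfig:
    no_flash_attn: bool = False

main_config = StubConfig()
draft_config: Optional[StubConfig] = StubConfig()

# Old behavior: the draft config copied the flag at creation time, before the
# non-FA2 fallback had been decided, so it could keep flash attention enabled
# even after the main model fell back.
draft_config.no_flash_attn = main_config.no_flash_attn  # still False here

# Later, the fallback path fires (e.g. no FA2 support / no paged attention).
main_config.no_flash_attn = True
# With this patch, the draft config is updated in the same place:
if draft_config:
    draft_config.no_flash_attn = True

# Both main and draft models now take the non-flash-attention path together.
assert draft_config is None or main_config.no_flash_attn == draft_config.no_flash_attn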