API + Model: Add new parameters and clean up documentation

The example JSON fields were changed because of the new sampler
default strategy. Fix these by manually changing the values.

Also add support for fasttensors and expose generate_window to
the API. It's recommended not to adjust generate_window as it's
dynamically scaled based on max_seq_len by default.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri
2024-01-25 00:11:30 -05:00
committed by Brian Dashore
parent 90fb41a77a
commit fc4570220c
4 changed files with 45 additions and 10 deletions

View File

@@ -138,13 +138,25 @@ class ExllamaV2Container:
kwargs.get("rope_alpha"), self.calculate_rope_alpha(base_seq_len)
)
# Enable CFG if present
use_cfg = unwrap(kwargs.get("use_cfg"), False)
if hasattr(ExLlamaV2Sampler.Settings, "cfg_scale"):
    self.use_cfg = use_cfg
elif use_cfg:
logger.warning(
"CFG is not supported by the currently installed ExLlamaV2 version."
)
# Enable fasttensors loading if present
use_fasttensors = unwrap(kwargs.get("fasttensors"), False)
if hasattr(ExLlamaV2Config, "fasttensors"):
self.config.fasttensors = use_fasttensors
elif use_fasttensors:
logger.warning(
"fasttensors is not supported by "
"the currently installed ExllamaV2 version."
)
# Turn off flash attention if CFG is on
# Workaround until batched FA2 is fixed in exllamav2 upstream
self.config.no_flash_attn = (
@@ -668,6 +680,7 @@ class ExllamaV2Container:
**vars(gen_settings),
token_healing=token_healing,
auto_scale_penalty_range=auto_scale_penalty_range,
generate_window=generate_window,
add_bos_token=add_bos_token,
ban_eos_token=ban_eos_token,
stop_conditions=stop_conditions,