Mirror of https://github.com/ostris/ai-toolkit.git
WIP on SAFE encoder. Work on fp16 training improvements. Various other tweaks and improvements
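Summary of the diff below: every Adam-family constructor call in `get_optimizer` gains an explicit `eps=1e-6`, overriding the library defaults (1e-8 for torch's Adam/AdamW, 1e-10 for Adagrad). This is presumably part of the commit's fp16 training work: an epsilon below fp16's smallest subnormal (2**-24, about 6e-8) flushes to zero when optimizer math runs in half precision, silently removing the divide-by-zero guard in the update. A quick representability check in plain PyTorch (not toolkit code):

```python
import torch

# fp16's smallest subnormal is 2**-24 ≈ 5.96e-8, so the usual Adam
# default eps=1e-8 rounds to zero in half precision, while 1e-6 survives.
print(torch.tensor(1e-8, dtype=torch.float16))  # tensor(0., dtype=torch.float16)
print(torch.tensor(1e-6, dtype=torch.float16))  # tensor(1.0133e-06, dtype=torch.float16)
```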
@@ -20,12 +20,12 @@ def get_optimizer(
             # dadaptation uses different lr that is values of 0.1 to 1.0. default to 1.0
             use_lr = 1.0
         if lower_type.endswith('lion'):
-            optimizer = dadaptation.DAdaptLion(params, lr=use_lr, **optimizer_params)
+            optimizer = dadaptation.DAdaptLion(params, eps=1e-6, lr=use_lr, **optimizer_params)
         elif lower_type.endswith('adam'):
-            optimizer = dadaptation.DAdaptLion(params, lr=use_lr, **optimizer_params)
+            optimizer = dadaptation.DAdaptLion(params, eps=1e-6, lr=use_lr, **optimizer_params)
         elif lower_type == 'dadaptation':
             # backwards compatibility
-            optimizer = dadaptation.DAdaptAdam(params, lr=use_lr, **optimizer_params)
+            optimizer = dadaptation.DAdaptAdam(params, eps=1e-6, lr=use_lr, **optimizer_params)
             # warn user that dadaptation is deprecated
             print("WARNING: Dadaptation optimizer type has been changed to DadaptationAdam. Please update your config.")
     elif lower_type.startswith("prodigy"):
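Note that both the `lion` and `adam` suffixes above construct `dadaptation.DAdaptLion`; the diff reflects the upstream source as-is. For orientation on what `eps` does: in an Adam-style update it sits in the denominator, so a larger value bounds the biggest per-parameter step and keeps the divide well away from zero. A hand-rolled sketch of one generic Adam step, purely illustrative and not dadaptation's implementation (which additionally estimates a step-size scale `d`):

```python
import torch

def adam_step(p, grad, m, v, step, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-6):
    """One generic Adam update; only to show where eps enters."""
    m.mul_(beta1).add_(grad, alpha=1 - beta1)            # first moment
    v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)  # second moment
    m_hat = m / (1 - beta1 ** step)                      # bias correction
    v_hat = v / (1 - beta2 ** step)
    p.sub_(lr * m_hat / (v_hat.sqrt() + eps))            # eps guards the divide
```

Lion's update, by contrast, is sign-based and has no denominator, so whether `DAdaptLion` accepts an `eps` keyword at all depends on the installed dadaptation version.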
@@ -40,22 +40,22 @@ def get_optimizer(
         print(f"Using lr {use_lr}")
         # let net be the neural network you want to train
         # you can choose weight decay value based on your problem, 0 by default
-        optimizer = Prodigy(params, lr=use_lr, **optimizer_params)
+        optimizer = Prodigy(params, lr=use_lr, eps=1e-6, **optimizer_params)
     elif lower_type.endswith("8bit"):
         import bitsandbytes

         if lower_type == "adam8bit":
-            return bitsandbytes.optim.Adam8bit(params, lr=learning_rate, **optimizer_params)
+            return bitsandbytes.optim.Adam8bit(params, lr=learning_rate, eps=1e-6, **optimizer_params)
         elif lower_type == "adamw8bit":
-            return bitsandbytes.optim.AdamW8bit(params, lr=learning_rate, **optimizer_params)
+            return bitsandbytes.optim.AdamW8bit(params, lr=learning_rate, eps=1e-6, **optimizer_params)
         elif lower_type == "lion8bit":
             return bitsandbytes.optim.Lion8bit(params, lr=learning_rate, **optimizer_params)
         else:
             raise ValueError(f'Unknown optimizer type {optimizer_type}')
     elif lower_type == 'adam':
-        optimizer = torch.optim.Adam(params, lr=float(learning_rate), **optimizer_params)
+        optimizer = torch.optim.Adam(params, lr=float(learning_rate), eps=1e-6, **optimizer_params)
     elif lower_type == 'adamw':
-        optimizer = torch.optim.AdamW(params, lr=float(learning_rate), **optimizer_params)
+        optimizer = torch.optim.AdamW(params, lr=float(learning_rate), eps=1e-6, **optimizer_params)
     elif lower_type == 'lion':
         try:
             from lion_pytorch import Lion
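A structural detail visible in this hunk: the 8-bit branches `return` immediately, while every other branch assigns to `optimizer`, so any shared handling after the if/elif chain only runs for the non-bitsandbytes paths. For completeness, a hedged usage sketch of the function; the module path is an assumption, not confirmed by the diff:

```python
import torch.nn as nn
from toolkit.optimizer import get_optimizer  # assumed import path

net = nn.Linear(16, 4)
# 'adamw8bit' routes to bitsandbytes.optim.AdamW8bit with eps=1e-6 after
# this commit; plain 'adamw' routes to torch.optim.AdamW the same way
opt = get_optimizer(net.parameters(), optimizer_type='adamw', learning_rate=1e-4)
```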
@@ -63,7 +63,7 @@ def get_optimizer(
         except ImportError:
             raise ImportError("Please install lion_pytorch to use Lion optimizer -> pip install lion-pytorch")
     elif lower_type == 'adagrad':
-        optimizer = torch.optim.Adagrad(params, lr=float(learning_rate), **optimizer_params)
+        optimizer = torch.optim.Adagrad(params, lr=float(learning_rate), eps=1e-6, **optimizer_params)
     elif lower_type == 'adafactor':
         # hack in stochastic rounding
         if 'relative_step' not in optimizer_params:
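The `# hack in stochastic rounding` comment anticipates the patching in the next hunk. The idea: when weights live in a low-precision format, round-to-nearest makes any update smaller than half a ULP vanish entirely, whereas rounding up or down at random, with probability proportional to the remainder, is unbiased in expectation. A generic bf16 illustration of the technique; the toolkit's actual `step_adafactor` may differ:

```python
import torch

def stochastic_round_to_bf16(x: torch.Tensor) -> torch.Tensor:
    """Stochastically round fp32 -> bf16 by adding uniform noise to the
    16 mantissa bits truncation discards. Sketch only; ignores inf/NaN."""
    assert x.dtype == torch.float32
    bits = x.view(torch.int32)                    # reinterpret bits, no copy
    noise = torch.randint_like(bits, 0, 1 << 16)  # uniform in [0, 2**16)
    rounded = (bits + noise) & -65536             # drop the low 16 bits
    return rounded.view(torch.float32).to(torch.bfloat16)

# bf16 spacing near 1.0 is 2**-8 ≈ 0.0039, so round-to-nearest turns
# 1.0001 back into 1.0 every time; the stochastic version is right on average
trials = torch.stack([stochastic_round_to_bf16(torch.full((1,), 1.0001))
                      for _ in range(10000)]).float()
print(trials.mean())  # ≈ 1.0001
```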
@@ -72,7 +72,7 @@ def get_optimizer(
             optimizer_params['scale_parameter'] = True
         if 'warmup_init' not in optimizer_params:
             optimizer_params['warmup_init'] = False
-        optimizer = Adafactor(params, lr=float(learning_rate), **optimizer_params)
+        optimizer = Adafactor(params, lr=float(learning_rate), eps=1e-6, **optimizer_params)
         from toolkit.util.adafactor_stochastic_rounding import step_adafactor
         optimizer.step = step_adafactor.__get__(optimizer, Adafactor)
     else:
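One caveat on this hunk: Hugging Face's Adafactor takes `eps` as a pair (default `eps=(1e-30, 1e-3)`), so how a scalar `eps=1e-6` behaves here depends on which Adafactor implementation is imported.

The final context lines wire the hack in: `step_adafactor` is a free function bound to this one instance through the descriptor protocol, so `optimizer.step()` receives `self` like any normal method; plain assignment (`optimizer.step = step_adafactor`) would not pass the instance. A self-contained illustration of the pattern, with hypothetical names rather than the toolkit's:

```python
class Opt:
    def step(self):
        return "original step"

def patched_step(self):
    # receives the instance as 'self', like a method defined on the class
    return f"patched step on {type(self).__name__}"

opt = Opt()
# functions are descriptors: __get__ produces a bound method for this
# instance only, leaving Opt.step untouched for other instances
opt.step = patched_step.__get__(opt, Opt)
print(opt.step())  # patched step on Opt
```

Binding per instance rather than monkey-patching `Adafactor.step` keeps the stochastic-rounding behavior scoped to the optimizer built here.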