mirror of
https://github.com/p-e-w/heretic.git
synced 2026-05-16 10:29:19 +00:00
feat(ara): remove tie_to_original_matrix term
This term was found experimentally to be 3-4 orders of magnitude smaller than the others in most runs, and to have no meaningful effect on the result of the optimization.
This commit is contained in:
@@ -484,11 +484,6 @@ def run():
|
||||
0.0,
|
||||
1.0,
|
||||
)
|
||||
tie_to_original_matrix_weight = trial.suggest_float(
|
||||
"tie_to_original_matrix_weight",
|
||||
0.2, # Minimum to prevent "optimizing" away the regularization term.
|
||||
1.0,
|
||||
)
|
||||
else:
|
||||
direction_scope = trial.suggest_categorical(
|
||||
"direction_scope",
|
||||
@@ -576,7 +571,6 @@ def run():
|
||||
end_layer_index,
|
||||
preserve_good_behavior_weight,
|
||||
steer_bad_behavior_weight,
|
||||
tie_to_original_matrix_weight,
|
||||
)
|
||||
else:
|
||||
print("* Resetting model...")
|
||||
@@ -772,7 +766,6 @@ def run():
|
||||
trial.params["end_layer_index"],
|
||||
trial.params["preserve_good_behavior_weight"],
|
||||
trial.params["steer_bad_behavior_weight"],
|
||||
trial.params["tie_to_original_matrix_weight"],
|
||||
)
|
||||
else:
|
||||
print("* Resetting model...")
|
||||
|
||||
@@ -545,7 +545,6 @@ class Model:
|
||||
end_layer_index: int,
|
||||
preserve_good_behavior_weight: float,
|
||||
steer_bad_behavior_weight: float,
|
||||
tie_to_original_matrix_weight: float,
|
||||
):
|
||||
for layer_index in range(start_layer_index, end_layer_index):
|
||||
for component, modules in self.get_layer_modules(layer_index).items():
|
||||
@@ -589,17 +588,9 @@ class Model:
|
||||
** 2
|
||||
).mean()
|
||||
|
||||
# The matrix itself should change as little as possible overall.
|
||||
# This prevents overfitting due to underdetermination of the
|
||||
# optimization problem from a relatively small number of I/O pairs.
|
||||
tie_to_original_matrix = (
|
||||
(matrix - original_matrix) ** 2
|
||||
).mean()
|
||||
|
||||
return (
|
||||
preserve_good_behavior_weight * preserve_good_behavior
|
||||
+ steer_bad_behavior_weight * steer_bad_behavior
|
||||
+ tie_to_original_matrix_weight * tie_to_original_matrix
|
||||
)
|
||||
|
||||
optimizer = LBFGS(
|
||||
|
||||
@@ -257,7 +257,6 @@ def get_trial_parameters(settings: Settings, trial: Trial) -> dict[str, str]:
|
||||
"end_layer_index": f"{trial.params['end_layer_index']}",
|
||||
"preserve_good_behavior_weight": f"{trial.params['preserve_good_behavior_weight']:.4f}",
|
||||
"steer_bad_behavior_weight": f"{trial.params['steer_bad_behavior_weight']:.4f}",
|
||||
"tie_to_original_matrix_weight": f"{trial.params['tie_to_original_matrix_weight']:.4f}",
|
||||
}
|
||||
else:
|
||||
params = {}
|
||||
|
||||
Reference in New Issue
Block a user