diff --git a/src/heretic/main.py b/src/heretic/main.py index 568b24c..e6207c3 100644 --- a/src/heretic/main.py +++ b/src/heretic/main.py @@ -484,11 +484,6 @@ def run(): 0.0, 1.0, ) - tie_to_original_matrix_weight = trial.suggest_float( - "tie_to_original_matrix_weight", - 0.2, # Minimum to prevent "optimizing" away the regularization term. - 1.0, - ) else: direction_scope = trial.suggest_categorical( "direction_scope", @@ -576,7 +571,6 @@ def run(): end_layer_index, preserve_good_behavior_weight, steer_bad_behavior_weight, - tie_to_original_matrix_weight, ) else: print("* Resetting model...") @@ -772,7 +766,6 @@ def run(): trial.params["end_layer_index"], trial.params["preserve_good_behavior_weight"], trial.params["steer_bad_behavior_weight"], - trial.params["tie_to_original_matrix_weight"], ) else: print("* Resetting model...") diff --git a/src/heretic/model.py b/src/heretic/model.py index 45a8235..91f5953 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -545,7 +545,6 @@ class Model: end_layer_index: int, preserve_good_behavior_weight: float, steer_bad_behavior_weight: float, - tie_to_original_matrix_weight: float, ): for layer_index in range(start_layer_index, end_layer_index): for component, modules in self.get_layer_modules(layer_index).items(): @@ -589,17 +588,9 @@ class Model: ** 2 ).mean() - # The matrix itself should change as little as possible overall. - # This prevents overfitting due to underdetermination of the - # optimization problem from a relatively small number of I/O pairs. - tie_to_original_matrix = ( - (matrix - original_matrix) ** 2 - ).mean() - return ( preserve_good_behavior_weight * preserve_good_behavior + steer_bad_behavior_weight * steer_bad_behavior - + tie_to_original_matrix_weight * tie_to_original_matrix ) optimizer = LBFGS( diff --git a/src/heretic/utils.py b/src/heretic/utils.py index a7748f1..dd3a176 100644 --- a/src/heretic/utils.py +++ b/src/heretic/utils.py @@ -257,7 +257,6 @@ def get_trial_parameters(settings: Settings, trial: Trial) -> dict[str, str]: "end_layer_index": f"{trial.params['end_layer_index']}", "preserve_good_behavior_weight": f"{trial.params['preserve_good_behavior_weight']:.4f}", "steer_bad_behavior_weight": f"{trial.params['steer_bad_behavior_weight']:.4f}", - "tie_to_original_matrix_weight": f"{trial.params['tie_to_original_matrix_weight']:.4f}", } else: params = {}