feat(ara): remove tie_to_original_matrix term

This term was found experimentally to be 3-4 orders of magnitude smaller than the others in most runs, and to have no meaningful effect on the result of the optimization.
2026-05-16 10:29:19 +00:00 · 2026-03-04 09:13:38 +05:30
parent 3c5d6920bf
commit bd1fa0ade4
3 changed files with 0 additions and 17 deletions
--- a/src/heretic/main.py
+++ b/src/heretic/main.py
@@ -484,11 +484,6 @@ def run():
                0.0,
                1.0,
            )
-            tie_to_original_matrix_weight = trial.suggest_float(
-                "tie_to_original_matrix_weight",
-                0.2,  # Minimum to prevent "optimizing" away the regularization term.
-                1.0,
-            )
        else:
            direction_scope = trial.suggest_categorical(
                "direction_scope",
@@ -576,7 +571,6 @@ def run():
                end_layer_index,
                preserve_good_behavior_weight,
                steer_bad_behavior_weight,
-                tie_to_original_matrix_weight,
            )
        else:
            print("* Resetting model...")
@@ -772,7 +766,6 @@ def run():
                    trial.params["end_layer_index"],
                    trial.params["preserve_good_behavior_weight"],
                    trial.params["steer_bad_behavior_weight"],
-                    trial.params["tie_to_original_matrix_weight"],
                )
            else:
                print("* Resetting model...")
--- a/src/heretic/model.py
+++ b/src/heretic/model.py
@@ -545,7 +545,6 @@ class Model:
        end_layer_index: int,
        preserve_good_behavior_weight: float,
        steer_bad_behavior_weight: float,
-        tie_to_original_matrix_weight: float,
    ):
        for layer_index in range(start_layer_index, end_layer_index):
            for component, modules in self.get_layer_modules(layer_index).items():
@@ -589,17 +588,9 @@ class Model:
                            ** 2
                        ).mean()

-                        # The matrix itself should change as little as possible overall.
-                        # This prevents overfitting due to underdetermination of the
-                        # optimization problem from a relatively small number of I/O pairs.
-                        tie_to_original_matrix = (
-                            (matrix - original_matrix) ** 2
-                        ).mean()
-
                        return (
                            preserve_good_behavior_weight * preserve_good_behavior
                            + steer_bad_behavior_weight * steer_bad_behavior
-                            + tie_to_original_matrix_weight * tie_to_original_matrix
                        )

                    optimizer = LBFGS(
--- a/src/heretic/utils.py
+++ b/src/heretic/utils.py
@@ -257,7 +257,6 @@ def get_trial_parameters(settings: Settings, trial: Trial) -> dict[str, str]:
            "end_layer_index": f"{trial.params['end_layer_index']}",
            "preserve_good_behavior_weight": f"{trial.params['preserve_good_behavior_weight']:.4f}",
            "steer_bad_behavior_weight": f"{trial.params['steer_bad_behavior_weight']:.4f}",
-            "tie_to_original_matrix_weight": f"{trial.params['tie_to_original_matrix_weight']:.4f}",
        }
    else:
        params = {}