
Commit 29f01d0

Add save and load methods to the model (#172)
1 parent 1371f83 commit 29f01d0

23 files changed: +475, -253 lines
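The model file itself (sparse_autoencoder/autoencoder/model.py) is not excerpted on this page, so the exact save/load signatures are not visible here. A rough usage sketch only, with the method names taken from the commit title and everything else assumed:

# Rough usage sketch, not the confirmed API: commit #172 adds save and load
# methods to the model, but their signatures are not shown in this excerpt.
# `save` taking a file path and `load` being a classmethod are assumptions.
from sparse_autoencoder import SparseAutoencoder, SparseAutoencoderConfig

config = SparseAutoencoderConfig(n_input_features=1024, n_learned_features=4096)
autoencoder = SparseAutoencoder(config)

autoencoder.save("sae.pt")  # assumed signature
restored = SparseAutoencoder.load("sae.pt")  # assumed classmethod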

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -141,6 +141,7 @@ docs/content/reference
 
 # Wandb
 wandb/
+artifacts/
 
 # Scratch files
 scratch.py

.vscode/cspell.json

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
     "autocast",
     "autoencoder",
     "autoencoders",
+    "autoencoding",
     "autofix",
     "capturable",
     "categoricalwprobabilities",
@@ -76,6 +77,7 @@
     "optim",
     "penality",
     "perp",
+    "pickleable",
     "polysemantic",
     "polysemantically",
     "polysemanticity",

README.md

Lines changed: 3 additions & 3 deletions
@@ -43,9 +43,9 @@ The library is designed to be modular. By default it takes the approach from [To
 Monosemanticity: Decomposing Language Models With Dictionary Learning
 ](https://transformer-circuits.pub/2023/monosemantic-features/index.html), so you can pip install
 the library and get started quickly. Then when you need to customise something, you can just extend
-the abstract class for that component (e.g. you can extend `AbstractEncoder` if you want to
-customise the encoder layer, and then easily drop it in the standard `SparseAutoencoder` model to
-keep everything else as is. Every component is fully documented, so it's nice and easy to do this.
+the class for that component (e.g. you can extend `SparseAutoencoder` if you want to customise the
+model, and then drop it back into the training pipeline. Every component is fully documented, so
+it's nice and easy to do this.
 
 ## Demo
 
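A minimal sketch of the workflow the new README wording describes, assuming only that `SparseAutoencoder` follows standard `torch.nn.Module` conventions; the subclass name is hypothetical:

# Hypothetical subclass illustrating the README's customisation workflow.
from sparse_autoencoder import SparseAutoencoder

class MySparseAutoencoder(SparseAutoencoder):
    """Custom variant that can be dropped back into the training pipeline."""

    def forward(self, x):
        # Customise whatever behaviour you need, delegating the rest.
        return super().forward(x)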
docs/content/flexible_demo.ipynb

Lines changed: 5 additions & 2 deletions
@@ -103,6 +103,7 @@
     "    Pipeline,\n",
     "    PreTokenizedDataset,\n",
     "    SparseAutoencoder,\n",
+    "    SparseAutoencoderConfig,\n",
     ")\n",
     "import wandb\n",
     "\n",
@@ -235,8 +236,10 @@
    "source": [
     "expansion_factor = hyperparameters[\"expansion_factor\"]\n",
     "autoencoder = SparseAutoencoder(\n",
-    "    n_input_features=autoencoder_input_dim,  # size of the activations we are autoencoding\n",
-    "    n_learned_features=int(autoencoder_input_dim * expansion_factor),  # size of SAE\n",
+    "    SparseAutoencoderConfig(\n",
+    "        n_input_features=autoencoder_input_dim,  # size of the activations we are autoencoding\n",
+    "        n_learned_features=int(autoencoder_input_dim * expansion_factor),  # size of SAE\n",
+    "    )\n",
     ").to(device)\n",
     "autoencoder"
     ]
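De-escaped from the notebook JSON, the updated cell constructs the model from a single config object rather than bare keyword arguments (`hyperparameters`, `autoencoder_input_dim` and `device` are defined earlier in the demo):

# The updated demo cell, de-escaped from JSON for readability.
expansion_factor = hyperparameters["expansion_factor"]
autoencoder = SparseAutoencoder(
    SparseAutoencoderConfig(
        n_input_features=autoencoder_input_dim,  # size of the activations we are autoencoding
        n_learned_features=int(autoencoder_input_dim * expansion_factor),  # size of SAE
    )
).to(device)
autoencoder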

sparse_autoencoder/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,7 @@
 """Sparse Autoencoder Library."""
 from sparse_autoencoder.activation_resampler.activation_resampler import ActivationResampler
 from sparse_autoencoder.activation_store.tensor_store import TensorActivationStore
-from sparse_autoencoder.autoencoder.model import SparseAutoencoder
+from sparse_autoencoder.autoencoder.model import SparseAutoencoder, SparseAutoencoderConfig
 from sparse_autoencoder.loss.abstract_loss import LossReductionType
 from sparse_autoencoder.loss.decoded_activations_l2 import L2ReconstructionLoss
 from sparse_autoencoder.loss.learned_activations_l1 import LearnedActivationsL1Loss
@@ -77,6 +77,7 @@
     "SourceModelHyperparameters",
     "SourceModelRuntimeHyperparameters",
     "SparseAutoencoder",
+    "SparseAutoencoderConfig",
     "sweep",
     "SweepConfig",
     "TensorActivationStore",

sparse_autoencoder/activation_resampler/activation_resampler.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 from sparse_autoencoder.autoencoder.model import SparseAutoencoder
 from sparse_autoencoder.loss.abstract_loss import AbstractLoss
 from sparse_autoencoder.tensor_types import Axis
-from sparse_autoencoder.train.utils import get_model_device
+from sparse_autoencoder.train.utils.get_model_device import get_model_device
 
 
 class LossInputActivationsTuple(NamedTuple):

sparse_autoencoder/activation_resampler/tests/test_activation_resampler.py

Lines changed: 12 additions & 10 deletions
@@ -9,7 +9,7 @@
 from sparse_autoencoder.activation_resampler.activation_resampler import ActivationResampler
 from sparse_autoencoder.activation_store.base_store import ActivationStore
 from sparse_autoencoder.activation_store.tensor_store import TensorActivationStore
-from sparse_autoencoder.autoencoder.model import SparseAutoencoder
+from sparse_autoencoder.autoencoder.model import SparseAutoencoder, SparseAutoencoderConfig
 from sparse_autoencoder.loss.decoded_activations_l2 import L2ReconstructionLoss
 from sparse_autoencoder.loss.learned_activations_l1 import LearnedActivationsL1Loss
 from sparse_autoencoder.loss.reducer import LossReducer
@@ -43,9 +43,11 @@ def full_activation_store() -> ActivationStore:
 def autoencoder_model() -> SparseAutoencoder:
     """Create a dummy autoencoder model."""
     return SparseAutoencoder(
-        n_components=DEFAULT_N_COMPONENTS,
-        n_input_features=DEFAULT_N_INPUT_FEATURES,
-        n_learned_features=DEFAULT_N_LEARNED_FEATURES,
+        SparseAutoencoderConfig(
+            n_input_features=DEFAULT_N_INPUT_FEATURES,
+            n_learned_features=DEFAULT_N_LEARNED_FEATURES,
+            n_components=DEFAULT_N_COMPONENTS,
+        )
     )
 
 
@@ -126,7 +128,7 @@ def test_more_items_than_in_store_error(
 ):
     ActivationResampler(
         resample_dataset_size=DEFAULT_N_ACTIVATIONS_STORE + 1,
-        n_learned_features=autoencoder_model.n_learned_features,
+        n_learned_features=DEFAULT_N_LEARNED_FEATURES,
     ).compute_loss_and_get_activations(
         store=full_activation_store,
         autoencoder=autoencoder_model,
@@ -285,7 +287,7 @@ def test_no_changes_if_no_dead_neurons(
         resample_interval=10,
         n_components=DEFAULT_N_COMPONENTS,
         n_activations_activity_collate=10,
-        n_learned_features=autoencoder_model.n_learned_features,
+        n_learned_features=DEFAULT_N_LEARNED_FEATURES,
         resample_dataset_size=100,
     )
     updates = resampler.step_resampler(
@@ -328,7 +330,7 @@ def test_updates_dead_neuron_parameters(
         resample_interval=10,
         n_activations_activity_collate=10,
         n_components=DEFAULT_N_COMPONENTS,
-        n_learned_features=autoencoder_model.n_learned_features,
+        n_learned_features=DEFAULT_N_LEARNED_FEATURES,
         resample_dataset_size=100,
     )
     parameter_updates = resampler.step_resampler(
@@ -343,7 +345,7 @@ def test_updates_dead_neuron_parameters(
     # Check the updated ones have changed
     for component_idx, neuron_idx in dead_neurons:
         # Decoder
-        decoder_weights = current_parameters["decoder._weight"]
+        decoder_weights = current_parameters["decoder.weight"]
         current_dead_neuron_weights = decoder_weights[component_idx, neuron_idx]
         updated_dead_decoder_weights = parameter_updates[
             component_idx
@@ -353,7 +355,7 @@ def test_updates_dead_neuron_parameters(
         ), "Dead decoder weights should have changed."
 
         # Encoder
-        current_dead_encoder_weights = current_parameters["encoder._weight"][
+        current_dead_encoder_weights = current_parameters["encoder.weight"][
             component_idx, neuron_idx
         ]
         updated_dead_encoder_weights = parameter_updates[
@@ -363,7 +365,7 @@
             current_dead_encoder_weights, updated_dead_encoder_weights
         ), "Dead encoder weights should have changed."
 
-        current_dead_encoder_bias = current_parameters["encoder._bias"][
+        current_dead_encoder_bias = current_parameters["encoder.bias"][
            component_idx, neuron_idx
         ]
         updated_dead_encoder_bias = parameter_updates[component_idx].dead_encoder_bias_updates

sparse_autoencoder/autoencoder/abstract_autoencoder.py

Lines changed: 0 additions & 74 deletions
This file was deleted.

sparse_autoencoder/autoencoder/components/linear_encoder.py

Lines changed: 12 additions & 27 deletions
@@ -42,33 +42,18 @@ class LinearEncoder(Module):
 
     _n_components: int | None
 
-    _weight: Float[
+    weight: Float[
         Parameter,
         Axis.names(Axis.COMPONENT_OPTIONAL, Axis.LEARNT_FEATURE, Axis.INPUT_OUTPUT_FEATURE),
     ]
-    """Weight parameter internal state."""
+    """Weight parameter.
 
-    _bias: Float[Parameter, Axis.names(Axis.COMPONENT_OPTIONAL, Axis.LEARNT_FEATURE)]
-    """Bias parameter internal state."""
-
-    @property
-    def weight(
-        self,
-    ) -> Float[
-        Parameter,
-        Axis.names(Axis.COMPONENT_OPTIONAL, Axis.LEARNT_FEATURE, Axis.INPUT_OUTPUT_FEATURE),
-    ]:
-        """Weight parameter.
-
-        Each row in the weights matrix acts as a dictionary vector, representing a single basis
-        element in the learned activation space.
-        """
-        return self._weight
+    Each row in the weights matrix acts as a dictionary vector, representing a single basis
+    element in the learned activation space.
+    """
 
-    @property
-    def bias(self) -> Float[Parameter, Axis.names(Axis.COMPONENT_OPTIONAL, Axis.LEARNT_FEATURE)]:
-        """Bias parameter."""
-        return self._bias
+    bias: Float[Parameter, Axis.names(Axis.COMPONENT_OPTIONAL, Axis.LEARNT_FEATURE)]
+    """Bias parameter."""
 
     @property
     def reset_optimizer_parameter_details(self) -> list[ResetOptimizerParameterDetails]:
@@ -109,12 +94,12 @@ def __init__(
         self._input_features = input_features
         self._n_components = n_components
 
-        self._weight = Parameter(
+        self.weight = Parameter(
             torch.empty(
                 shape_with_optional_dimensions(n_components, learnt_features, input_features),
             )
         )
-        self._bias = Parameter(
+        self.bias = Parameter(
             torch.zeros(shape_with_optional_dimensions(n_components, learnt_features))
         )
         self.activation_function = ReLU()
@@ -125,12 +110,12 @@ def reset_parameters(self) -> None:
         """Initialize or reset the parameters."""
         # Assumes we are using ReLU activation function (for e.g. leaky ReLU, the `a` parameter and
         # `nonlinerity` must be changed.
-        init.kaiming_uniform_(self._weight, nonlinearity="relu")
+        init.kaiming_uniform_(self.weight, nonlinearity="relu")
 
         # Bias (approach from nn.Linear)
-        fan_in = self._weight.size(1)
+        fan_in = self.weight.size(1)
         bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
-        init.uniform_(self._bias, -bound, bound)
+        init.uniform_(self.bias, -bound, bound)
 
     def forward(
         self,
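Replacing the `_weight`/`_bias` attributes (plus read-only properties) with plain `Parameter` attributes also renames the keys that `state_dict()` produces, which is what the test changes above reflect (`encoder._weight` becoming `encoder.weight`) and presumably what the new save method writes to disk. A standalone sketch of the underlying torch mechanics, not this library's classes:

import torch
from torch import nn


class WithProperty(nn.Module):
    """Parameter hidden behind a property: the state_dict key is the private name."""

    def __init__(self) -> None:
        super().__init__()
        self._weight = nn.Parameter(torch.zeros(2, 3))

    @property
    def weight(self) -> nn.Parameter:
        return self._weight


class PlainAttribute(nn.Module):
    """Parameter as a public attribute: the state_dict key matches the public name."""

    def __init__(self) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(2, 3))


print(list(WithProperty().state_dict()))    # ['_weight']
print(list(PlainAttribute().state_dict()))  # ['weight']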

sparse_autoencoder/autoencoder/components/tests/test_compare_neel_implementation.py

Lines changed: 20 additions & 12 deletions
@@ -5,7 +5,7 @@
 import torch
 from torch import nn
 
-from sparse_autoencoder.autoencoder.model import SparseAutoencoder
+from sparse_autoencoder.autoencoder.model import SparseAutoencoder, SparseAutoencoderConfig
 
 
 class NeelAutoencoder(nn.Module):
@@ -66,8 +66,10 @@ def test_biases_initialised_same_way() -> None:
 
     torch.random.manual_seed(0)
     autoencoder = SparseAutoencoder(
-        n_input_features=n_input_features,
-        n_learned_features=n_learned_features,
+        SparseAutoencoderConfig(
+            n_input_features=n_input_features,
+            n_learned_features=n_learned_features,
+        )
     )
 
     torch.random.manual_seed(0)
@@ -91,8 +93,10 @@ def test_forward_pass_same_weights() -> None:
     l1_coefficient: float = 0.01
 
     autoencoder = SparseAutoencoder(
-        n_input_features=n_input_features,
-        n_learned_features=n_learned_features,
+        SparseAutoencoderConfig(
+            n_input_features=n_input_features,
+            n_learned_features=n_learned_features,
+        )
     )
     neel_autoencoder = NeelAutoencoder(
         d_hidden=n_learned_features,
@@ -122,8 +126,10 @@ def test_unit_norm_weights() -> None:
     l1_coefficient: float = 0.01
 
     autoencoder = SparseAutoencoder(
-        n_input_features=n_input_features,
-        n_learned_features=n_learned_features,
+        SparseAutoencoderConfig(
+            n_input_features=n_input_features,
+            n_learned_features=n_learned_features,
+        )
     )
     neel_autoencoder = NeelAutoencoder(
         d_hidden=n_learned_features,
@@ -135,7 +141,7 @@ def test_unit_norm_weights() -> None:
 
     # Set the same decoder weights
     decoder_weights = torch.rand_like(autoencoder.decoder.weight)
-    autoencoder.decoder._weight.data = decoder_weights  # noqa: SLF001 # type: ignore
+    autoencoder.decoder.weight.data = decoder_weights  # type: ignore
     neel_autoencoder.W_dec.data = decoder_weights.T
 
     # Do a forward & backward pass so we have gradients
@@ -165,8 +171,10 @@ def test_unit_norm_weights_grad() -> None:
     l1_coefficient: float = 0.01
 
     autoencoder = SparseAutoencoder(
-        n_input_features=n_input_features,
-        n_learned_features=n_learned_features,
+        SparseAutoencoderConfig(
+            n_input_features=n_input_features,
+            n_learned_features=n_learned_features,
+        )
     )
     neel_autoencoder = NeelAutoencoder(
         d_hidden=n_learned_features,
@@ -176,9 +184,9 @@ def test_unit_norm_weights_grad() -> None:
 
     # Set the same decoder weights
     decoder_weights = torch.rand_like(autoencoder.decoder.weight)
-    autoencoder.decoder._weight.data = decoder_weights  # noqa: SLF001 # type: ignore
+    autoencoder.decoder.weight.data = decoder_weights  # type: ignore
     neel_autoencoder.W_dec.data = decoder_weights.T
-    autoencoder.decoder._weight.grad = torch.zeros_like(autoencoder.decoder.weight)  # noqa: SLF001 # type: ignore
+    autoencoder.decoder.weight.grad = torch.zeros_like(autoencoder.decoder.weight)  # type: ignore
     neel_autoencoder.W_dec.grad = torch.zeros_like(neel_autoencoder.W_dec)
 
     # Set the same tied bias weights
