
Commit 1a95cd6

fix e2e test
1 parent 11e10e5 commit 1a95cd6

5 files changed (+42, -29 lines)


.pre-commit-config.yaml (+4)

@@ -16,3 +16,7 @@ repos:
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
+  - repo: https://github.com/RobertCraigie/pyright-python
+    rev: v1.1.399
+    hooks:
+      - id: pyright

delphi/log/result_analysis.py (+19, -16)

@@ -27,9 +27,9 @@ def compute_auc(df: pd.DataFrame) -> float | None:
     if not df.probability.nunique():
         return None
 
-    df = df[df.probability.notna()]
+    valid_df = df[df.probability.notna()]
 
-    return roc_auc_score(df.activating, df.probability)  # type: ignore
+    return roc_auc_score(valid_df.activating, valid_df.probability)  # type: ignore
 
 
 def plot_accuracy_hist(df: pd.DataFrame, out_dir: Path):
@@ -49,10 +49,10 @@ def plot_roc_curve(df: pd.DataFrame, out_dir: Path):
         return
 
     # filter out NANs
-    df = df[df.probability.notna()]
+    valid_df = df[df.probability.notna()]
 
-    fpr, tpr, _ = roc_curve(df.activating, df.probability)
-    auc = roc_auc_score(df.activating, df.probability)
+    fpr, tpr, _ = roc_curve(valid_df.activating, valid_df.probability)
+    auc = roc_auc_score(valid_df.activating, valid_df.probability)
     fig = go.Figure(
         data=[
             go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={auc:.3f})"),
@@ -173,6 +173,19 @@ def parse_score_file(path: Path) -> pd.DataFrame:
     return pd.concat(latent_dfs, ignore_index=True), counts
 
 
+def get_metrics(latent_df: pd.DataFrame) -> pd.DataFrame:
+    processed_rows = []
+    for score_type, group_df in latent_df.groupby("score_type"):
+        conf = compute_confusion(group_df)
+        class_m = compute_classification_metrics(conf)
+        auc = compute_auc(group_df)
+
+        row = {"score_type": score_type, **conf, **class_m, "auc": auc}
+        processed_rows.append(row)
+
+    return pd.DataFrame(processed_rows)
+
+
 def log_results(scores_path: Path, viz_path: Path, modules: list[str]):
     import_plotly()
 
@@ -187,17 +200,7 @@ def log_results(scores_path: Path, viz_path: Path, modules: list[str]):
 
     plot_roc_curve(latent_df, viz_path)
 
-    # Produce statistics averaged over layers and latents
-    processed_rows = []
-    for score_type, group_df in latent_df.groupby("score_type"):
-        conf = compute_confusion(group_df)
-        class_m = compute_classification_metrics(conf)
-        auc = compute_auc(group_df)
-
-        row = {"score_type": score_type, **conf, **class_m, "auc": auc}
-        processed_rows.append(row)
-
-    processed_df = pd.DataFrame(processed_rows)
+    processed_df = get_metrics(latent_df)
 
     plot_accuracy_hist(processed_df, viz_path)
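The extracted get_metrics helper makes the per-score-type aggregation reusable outside log_results. A minimal sketch of calling it directly, assuming the results directory and hookpoint names below are placeholders, and assuming compute_classification_metrics reports an "accuracy" field (as the updated e2e test below relies on):

from pathlib import Path

from delphi.log.result_analysis import get_metrics, load_data

# Placeholder run directory and hookpoints for illustration only.
scores_path = Path("results/example_run/scores")
hookpoints = ["layers.5.mlp"]

# load_data returns the per-latent score DataFrame plus counts.
latent_df, _counts = load_data(scores_path, hookpoints)

# One row per score_type, combining confusion counts,
# classification metrics, and AUC.
metrics_df = get_metrics(latent_df)
print(metrics_df[["score_type", "accuracy", "auc"]])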

pyproject.toml (+2)

@@ -10,7 +10,9 @@ readme = "README.md"
 requires-python = ">=3.10"
 keywords = ["interpretability", "explainable-ai"]
 dependencies = [
+    "torch",
     "datasets",
+    "transformers",
     "orjson",
     "eai-sparsify",
     "safetensors",

pyrightconfig.json (+11)

@@ -0,0 +1,11 @@
+{
+    "include": ["delphi"],
+    "exclude": [
+        "**/node_modules",
+        "**/__pycache__"
+    ],
+    "reportMissingImports": "none",
+    "reportMissingModuleSource": "none",
+    "pythonVersion": "3.10",
+    "typeCheckingMode": "basic"
+}

tests/e2e.py (+6, -13)

@@ -6,7 +6,7 @@
 
 from delphi.__main__ import run
 from delphi.config import CacheConfig, ConstructorConfig, RunConfig, SamplerConfig
-from delphi.log.result_analysis import build_scores_df, latent_balanced_score_metrics
+from delphi.log.result_analysis import get_metrics, load_data
 
 
 async def test():
@@ -58,21 +58,14 @@ async def test():
     end_time = time.time()
     print(f"Time taken: {end_time - start_time} seconds")
 
-    # Performs better than random guessing
     scores_path = Path.cwd() / "results" / run_cfg.name / "scores"
-    hookpoint_firing_counts = torch.load(
-        Path.cwd() / "results" / run_cfg.name / "log" / "hookpoint_firing_counts.pt",
-        weights_only=True,
-    )
-    df = build_scores_df(scores_path, run_cfg.hookpoints, hookpoint_firing_counts)
-    for score_type in df["score_type"].unique():
-        score_df = df.query(f"score_type == '{score_type}'")
 
-        weighted_mean_metrics = latent_balanced_score_metrics(
-            score_df, score_type, verbose=False
-        )
+    latent_df, _ = load_data(scores_path, run_cfg.hookpoints)
+    processed_df = get_metrics(latent_df)
 
-        accuracy = weighted_mean_metrics["accuracy"]
+    # Performs better than random guessing
+    for score_type, df in processed_df.groupby("score_type"):
+        accuracy = df["accuracy"].mean()
         assert accuracy > 0.55, f"Score type {score_type} has an accuracy of {accuracy}"
 
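For reference, the new assertion reduces to a plain mean over the get_metrics output rather than the previous latent-balanced weighted metrics. A self-contained sketch of the check, with made-up score type names and accuracy values for illustration only:

import pandas as pd

# Stand-in for get_metrics output: one row per score type with an accuracy column.
processed_df = pd.DataFrame(
    {"score_type": ["detection", "fuzz"], "accuracy": [0.71, 0.64]}
)

for score_type, df in processed_df.groupby("score_type"):
    accuracy = df["accuracy"].mean()
    assert accuracy > 0.55, f"Score type {score_type} has an accuracy of {accuracy}"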
