From b111aaf133045362b9376895dea2f2164958ef59 Mon Sep 17 00:00:00 2001
From: Brad Miller <bradm@twitter.com>
Date: Thu, 12 Dec 2024 16:03:46 -0800
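Subject: [PATCH] global expansion

* constants.py: move scoring groups 15 and 29 from expansionPlusGroups to
  expansionGroups, move group 24 from expansionGroups to
  expansionPlusGroups, and add groups 31 and 32 to expansionPlusGroups.
* scoring_rules.py: change the third RuleAndVersion argument of
  RuleID.EXPANSION_MODEL from False to True, matching CORE_MODEL.
* run_scoring.py: extract the Post Selection Similarity computation from
  run_prescoring into a new run_post_selection_similarity function.
  run_prescoring now takes the precomputed postSelectionSimilarityValues
  as a parameter and only filters ratings with them; run_scoring computes
  the values from `notes` and `ratings` and passes them through.
* pandas_utils.py: list run_post_selection_similarity.py among the entry
  points that expect args as the second positional argument.
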
---
 sourcecode/scoring/constants.py     |  4 ++--
 sourcecode/scoring/pandas_utils.py  |  1 +
 sourcecode/scoring/run_scoring.py   | 19 ++++++++++++++-----
 sourcecode/scoring/scoring_rules.py |  2 +-
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py
index 46bd70aa..4daa1c97 100644
--- a/sourcecode/scoring/constants.py
+++ b/sourcecode/scoring/constants.py
@@ -78,8 +78,8 @@
 
 # Scoring Groups
 coreGroups: Set[int] = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
-expansionGroups: Set[int] = {0, 4, 5, 7, 12, 16, 18, 20, 22, 23, 24, 26, 27, 28}
-expansionPlusGroups: Set[int] = {15, 17, 29, 30}
+expansionGroups: Set[int] = {0, 4, 5, 7, 12, 15, 16, 18, 20, 22, 23, 26, 27, 28, 29}
+expansionPlusGroups: Set[int] = {17, 24, 30, 31, 32}
 
 # TSV Values
 notHelpfulValueTsv = "NOT_HELPFUL"
diff --git a/sourcecode/scoring/pandas_utils.py b/sourcecode/scoring/pandas_utils.py
index 42129a7a..e2d5d109 100644
--- a/sourcecode/scoring/pandas_utils.py
+++ b/sourcecode/scoring/pandas_utils.py
@@ -657,6 +657,7 @@ def _inner(*args, **kwargs) -> Any:
       clArgs = kwargs["args"]
     else:
       # Handle the following, which expect args as the second positional argument:
+      # birdwatch/scoring/src/main/python/run_post_selection_similarity.py
       # birdwatch/scoring/src/main/python/run_prescoring.py
       # birdwatch/scoring/src/main/python/run_final_scoring.py
       # birdwatch/scoring/src/main/python/run_contributor_scoring.py
diff --git a/sourcecode/scoring/run_scoring.py b/sourcecode/scoring/run_scoring.py
index 4a31f823..e47a1fee 100644
--- a/sourcecode/scoring/run_scoring.py
+++ b/sourcecode/scoring/run_scoring.py
@@ -1039,11 +1039,21 @@ def _validate_contributor_scoring_output(helpfulnessScores: pd.DataFrame) -> pd.
   return helpfulnessScores
 
 
+def run_post_selection_similarity(notes: pd.DataFrame, ratings: pd.DataFrame) -> pd.DataFrame:
+  with c.time_block("Compute Post Selection Similarity"):
+    similarity = PostSelectionSimilarity(notes, ratings)
+    pssValues = similarity.get_post_selection_similarity_values()
+    del similarity  # drop the local reference once the values are extracted
+    gc.collect()  # then force a collection pass before leaving the block
+  return pssValues
+
+
 def run_prescoring(
   notes: pd.DataFrame,
   ratings: pd.DataFrame,
   noteStatusHistory: pd.DataFrame,
   userEnrollment: pd.DataFrame,
+  postSelectionSimilarityValues: pd.DataFrame,
   seed: Optional[int] = None,
   enabledScorers: Optional[Set[Scorers]] = None,
   runParallel: bool = True,
@@ -1081,16 +1091,12 @@ def run_prescoring(
   logger.info(
     f"ratings summary before PSS: {get_df_fingerprint(ratings, [c.noteIdKey, c.raterParticipantIdKey])}"
   )
-  with c.time_block("Compute Post Selection Similarity"):
-    pss = PostSelectionSimilarity(notes, ratings)
-    postSelectionSimilarityValues = pss.get_post_selection_similarity_values()
+  with c.time_block("Filter ratings by Post Selection Similarity"):
     logger.info(f"Post Selection Similarity Prescoring: begin with {len(ratings)} ratings.")
     ratings = filter_ratings_by_post_selection_similarity(
       notes, ratings, postSelectionSimilarityValues
     )
     logger.info(f"Post Selection Similarity Prescoring: {len(ratings)} ratings remaining.")
-    del pss
-    gc.collect()
   logger.info(
     f"ratings summary after PSS: {get_df_fingerprint(ratings, [c.noteIdKey, c.raterParticipantIdKey])}"
   )
@@ -1868,6 +1874,8 @@ def run_scoring(
     filterPrescoringInputToSimulateDelayInHours,
   )
 
+  postSelectionSimilarityValues = run_post_selection_similarity(notes=notes, ratings=ratings)
+
   (
     prescoringNoteModelOutput,
     prescoringRaterModelOutput,
@@ -1880,6 +1888,7 @@ def run_scoring(
     ratings=prescoringRatingsInput,
     noteStatusHistory=noteStatusHistory,
     userEnrollment=userEnrollment,
+    postSelectionSimilarityValues=postSelectionSimilarityValues,
     seed=seed,
     enabledScorers=enabledScorers,
     runParallel=runParallel,
diff --git a/sourcecode/scoring/scoring_rules.py b/sourcecode/scoring/scoring_rules.py
index 7994cd2c..fcbd07cd 100644
--- a/sourcecode/scoring/scoring_rules.py
+++ b/sourcecode/scoring/scoring_rules.py
@@ -40,7 +40,7 @@ class RuleID(Enum):
 
   # Rules used in _meta_score.
   META_INITIAL_NMR = RuleAndVersion("MetaInitialNMR", "1.0", False)
-  EXPANSION_MODEL = RuleAndVersion("ExpansionModel", "1.1", False)
+  EXPANSION_MODEL = RuleAndVersion("ExpansionModel", "1.1", True)
   EXPANSION_PLUS_MODEL = RuleAndVersion("ExpansionPlusModel", "1.1", False)
   CORE_MODEL = RuleAndVersion("CoreModel", "1.1", True)
   COVERAGE_MODEL = RuleAndVersion("CoverageModel", "1.1", False)