From b111aaf133045362b9376895dea2f2164958ef59 Mon Sep 17 00:00:00 2001 From: Brad Miller Date: Thu, 12 Dec 2024 16:03:46 -0800 Subject: [PATCH] global expansion --- sourcecode/scoring/constants.py | 4 ++-- sourcecode/scoring/pandas_utils.py | 1 + sourcecode/scoring/run_scoring.py | 19 ++++++++++++++----- sourcecode/scoring/scoring_rules.py | 2 +- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py index 46bd70aa..4daa1c97 100644 --- a/sourcecode/scoring/constants.py +++ b/sourcecode/scoring/constants.py @@ -78,8 +78,8 @@ # Scoring Groups coreGroups: Set[int] = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25} -expansionGroups: Set[int] = {0, 4, 5, 7, 12, 16, 18, 20, 22, 23, 24, 26, 27, 28} -expansionPlusGroups: Set[int] = {15, 17, 29, 30} +expansionGroups: Set[int] = {0, 4, 5, 7, 12, 15, 16, 18, 20, 22, 23, 26, 27, 28, 29} +expansionPlusGroups: Set[int] = {17, 24, 30, 31, 32} # TSV Values notHelpfulValueTsv = "NOT_HELPFUL" diff --git a/sourcecode/scoring/pandas_utils.py b/sourcecode/scoring/pandas_utils.py index 42129a7a..e2d5d109 100644 --- a/sourcecode/scoring/pandas_utils.py +++ b/sourcecode/scoring/pandas_utils.py @@ -657,6 +657,7 @@ def _inner(*args, **kwargs) -> Any: clArgs = kwargs["args"] else: # Handle the following, which expect args as the second positional argument: + # birdwatch/scoring/src/main/python/run_post_selection_similarity.py # birdwatch/scoring/src/main/python/run_prescoring.py # birdwatch/scoring/src/main/python/run_final_scoring.py # birdwatch/scoring/src/main/python/run_contributor_scoring.py diff --git a/sourcecode/scoring/run_scoring.py b/sourcecode/scoring/run_scoring.py index 4a31f823..e47a1fee 100644 --- a/sourcecode/scoring/run_scoring.py +++ b/sourcecode/scoring/run_scoring.py @@ -1039,11 +1039,21 @@ def _validate_contributor_scoring_output(helpfulnessScores: pd.DataFrame) -> pd. return helpfulnessScores +def run_post_selection_similarity(notes: pd.DataFrame, ratings: pd.DataFrame) -> pd.DataFrame: + with c.time_block("Compute Post Selection Similarity"): + pss = PostSelectionSimilarity(notes, ratings) + postSelectionSimilarityValues = pss.get_post_selection_similarity_values() + del pss + gc.collect() + return postSelectionSimilarityValues + + def run_prescoring( notes: pd.DataFrame, ratings: pd.DataFrame, noteStatusHistory: pd.DataFrame, userEnrollment: pd.DataFrame, + postSelectionSimilarityValues: pd.DataFrame, seed: Optional[int] = None, enabledScorers: Optional[Set[Scorers]] = None, runParallel: bool = True, @@ -1081,16 +1091,12 @@ def run_prescoring( logger.info( f"ratings summary before PSS: {get_df_fingerprint(ratings, [c.noteIdKey, c.raterParticipantIdKey])}" ) - with c.time_block("Compute Post Selection Similarity"): - pss = PostSelectionSimilarity(notes, ratings) - postSelectionSimilarityValues = pss.get_post_selection_similarity_values() + with c.time_block("Filter ratings by Post Selection Similarity"): logger.info(f"Post Selection Similarity Prescoring: begin with {len(ratings)} ratings.") ratings = filter_ratings_by_post_selection_similarity( notes, ratings, postSelectionSimilarityValues ) logger.info(f"Post Selection Similarity Prescoring: {len(ratings)} ratings remaining.") - del pss - gc.collect() logger.info( f"ratings summary after PSS: {get_df_fingerprint(ratings, [c.noteIdKey, c.raterParticipantIdKey])}" ) @@ -1868,6 +1874,8 @@ def run_scoring( filterPrescoringInputToSimulateDelayInHours, ) + postSelectionSimilarityValues = run_post_selection_similarity(notes=notes, ratings=ratings) + ( prescoringNoteModelOutput, prescoringRaterModelOutput, @@ -1880,6 +1888,7 @@ def run_scoring( ratings=prescoringRatingsInput, noteStatusHistory=noteStatusHistory, userEnrollment=userEnrollment, + postSelectionSimilarityValues=postSelectionSimilarityValues, seed=seed, enabledScorers=enabledScorers, runParallel=runParallel, diff --git a/sourcecode/scoring/scoring_rules.py b/sourcecode/scoring/scoring_rules.py index 7994cd2c..fcbd07cd 100644 --- a/sourcecode/scoring/scoring_rules.py +++ b/sourcecode/scoring/scoring_rules.py @@ -40,7 +40,7 @@ class RuleID(Enum): # Rules used in _meta_score. META_INITIAL_NMR = RuleAndVersion("MetaInitialNMR", "1.0", False) - EXPANSION_MODEL = RuleAndVersion("ExpansionModel", "1.1", False) + EXPANSION_MODEL = RuleAndVersion("ExpansionModel", "1.1", True) EXPANSION_PLUS_MODEL = RuleAndVersion("ExpansionPlusModel", "1.1", False) CORE_MODEL = RuleAndVersion("CoreModel", "1.1", True) COVERAGE_MODEL = RuleAndVersion("CoverageModel", "1.1", False)