Merge pull request #220 from twitter/jbaxter/2024_04_26

jbaxter · web-flow · commit adbc126a7da0 · 2024-04-26T15:33:00.000-07:00
Freeze rater parameters in final scoring, turn on status locking, parquet output + more column output
diff --git a/requirements.txt b/requirements.txt
@@ -3,3 +3,4 @@ pandas==2.1.4
 torch==2.1.2
 scipy==1.11.4
 scikit-learn>=1.3.0
+pyarrow
diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py
@@ -134,10 +134,12 @@ def rater_factor_key(i):
 expansionRatingStatusKey = "expansionRatingStatus"
 expansionNoteInterceptMaxKey = "expansionNoteInterceptMax"
 expansionNoteInterceptMinKey = "expansionNoteInterceptMin"
+expansionInternalActiveRulesKey = "expansionActiveRules"
 # ExpansionPlus Model
 expansionPlusNoteInterceptKey = "expansionPlusNoteIntercept"
 expansionPlusNoteFactor1Key = "expansionPlusNoteFactor1"
 expansionPlusRatingStatusKey = "expansionPlusRatingStatus"
+expansionPlusInternalActiveRulesKey = "expansionPlusActiveRules"
 # Coverage / Helpfulness Reputation Model
 coverageNoteInterceptKey = "coverageNoteIntercept"
 coverageNoteFactor1Key = "coverageNoteFactor1"
@@ -153,11 +155,13 @@ def rater_factor_key(i):
 groupNoteInterceptMinKey = "groupNoteInterceptMin"
 groupRaterInterceptKey = "groupRaterIntercept"
 groupRaterFactor1Key = "groupRaterFactor1"
+groupInternalActiveRulesKey = "groupActiveRules"
 # Topic Model
 topicNoteInterceptKey = "topicNoteIntercept"
 topicNoteFactor1Key = "topicNoteFactor1"
 topicRatingStatusKey = "topicRatingStatus"
 topicNoteConfidentKey = "topicNoteConfident"
+topicInternalActiveRulesKey = "topicActiveRules"
 # Harassment/Abuse Tag
 harassmentNoteInterceptKey = "harassmentNoteIntercept"
 harassmentNoteFactor1Key = "harassmentNoteFactor1"
@@ -558,6 +562,10 @@ def rater_factor_key(i):
   (topicRatingStatusKey, str),
   (noteTopicKey, str),
   (topicNoteConfidentKey, str),
+  (expansionInternalActiveRulesKey, str),
+  (expansionPlusInternalActiveRulesKey, str),
+  (groupInternalActiveRulesKey, str),
+  (topicInternalActiveRulesKey, str),
 ]
 noteModelOutputTSVColumns = [col for (col, dtype) in noteModelOutputTSVColumnsAndTypes]
 noteModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in noteModelOutputTSVColumnsAndTypes}
diff --git a/sourcecode/scoring/matrix_factorization/matrix_factorization.py b/sourcecode/scoring/matrix_factorization/matrix_factorization.py
@@ -443,6 +443,7 @@ def run_mf(
     globalInterceptInit: Optional[float] = None,
     specificNoteId: Optional[int] = None,
     validatePercent: Optional[float] = None,
+    freezeRaterParameters: bool = False,
   ):
     """Train matrix factorization model.
 
@@ -466,6 +467,8 @@ def run_mf(
     self._create_mf_model(noteInit, userInit, globalInterceptInit)
     assert self.mf_model is not None
 
+    if freezeRaterParameters:
+      self.mf_model._freeze_parameters(set({"user"}))
     if specificNoteId is not None:
       self.mf_model.freeze_rater_and_global_parameters()
     self.prepare_features_and_labels(specificNoteId)
diff --git a/sourcecode/scoring/mf_base_scorer.py b/sourcecode/scoring/mf_base_scorer.py
@@ -570,6 +570,18 @@ def _prescore_notes_and_users(
       if self._saveIntermediateState:
         self.helpfulnessScores = helpfulnessScores
 
+      ## Run one extra final round of matrix factorization (MF).
+      # Filter ratings based on the previously computed helpfulness scores
+      finalRoundRatings = helpfulness_scores.filter_ratings_by_helpfulness_scores(
+        ratingsForTraining, helpfulnessScores
+      )
+      # Run MF
+      noteParamsUnfiltered, raterParamsUnfiltered, globalBias = self._mfRanker.run_mf(
+        ratings=finalRoundRatings,
+        noteInit=noteParamsUnfiltered,
+        userInit=raterParamsUnfiltered,
+      )
+
     raterModelOutput = raterParamsUnfiltered.merge(
       helpfulnessScores[
         [
@@ -644,6 +656,8 @@ def _score_notes_and_users(
         ratings=finalRoundRatings,
         noteInit=prescoringNoteModelOutput,
         userInit=prescoringRaterModelOutput,
+        globalInterceptInit=0.17,
+        freezeRaterParameters=True,
       )
 
     if self._saveIntermediateState:
diff --git a/sourcecode/scoring/mf_expansion_plus_scorer.py b/sourcecode/scoring/mf_expansion_plus_scorer.py
@@ -37,6 +37,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
       c.internalNoteInterceptKey: c.expansionPlusNoteInterceptKey,
       c.internalNoteFactor1Key: c.expansionPlusNoteFactor1Key,
       c.internalRatingStatusKey: c.expansionPlusRatingStatusKey,
+      c.internalActiveRulesKey: c.expansionPlusInternalActiveRulesKey,
     }
 
   def get_scored_notes_cols(self) -> List[str]:
@@ -46,6 +47,7 @@ def get_scored_notes_cols(self) -> List[str]:
       c.expansionPlusNoteInterceptKey,
       c.expansionPlusNoteFactor1Key,
       c.expansionPlusRatingStatusKey,
+      c.expansionPlusInternalActiveRulesKey,
     ]
 
   def get_helpfulness_scores_cols(self) -> List[str]:
@@ -60,7 +62,6 @@ def _get_dropped_note_cols(self) -> List[str]:
     """Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
     return super()._get_dropped_note_cols() + (
       [
-        c.internalActiveRulesKey,
         c.activeFilterTagsKey,
         c.ratingWeightKey,
         c.noteInterceptMinKey,
diff --git a/sourcecode/scoring/mf_expansion_scorer.py b/sourcecode/scoring/mf_expansion_scorer.py
@@ -43,6 +43,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
       c.internalRatingStatusKey: c.expansionRatingStatusKey,
       c.noteInterceptMinKey: c.expansionNoteInterceptMinKey,
       c.noteInterceptMaxKey: c.expansionNoteInterceptMaxKey,
+      c.internalActiveRulesKey: c.expansionInternalActiveRulesKey,
     }
 
   def get_scored_notes_cols(self) -> List[str]:
@@ -54,6 +55,7 @@ def get_scored_notes_cols(self) -> List[str]:
       c.expansionRatingStatusKey,
       c.expansionNoteInterceptMinKey,
       c.expansionNoteInterceptMaxKey,
+      c.expansionInternalActiveRulesKey,
     ]
 
   def get_helpfulness_scores_cols(self) -> List[str]:
@@ -68,7 +70,6 @@ def _get_dropped_note_cols(self) -> List[str]:
     """Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
     return super()._get_dropped_note_cols() + (
       [
-        c.internalActiveRulesKey,
         c.activeFilterTagsKey,
         c.ratingWeightKey,
       ]
diff --git a/sourcecode/scoring/mf_group_scorer.py b/sourcecode/scoring/mf_group_scorer.py
@@ -44,6 +44,7 @@ def coalesce_group_models(
     c.groupNoteInterceptMaxKey,
     c.groupNoteInterceptMinKey,
     c.modelingGroupKey,
+    c.groupInternalActiveRulesKey,
   ]:
     scoredNotes = coalesce_columns(scoredNotes, col)
 
@@ -135,6 +136,7 @@ def __init__(
     self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupNumber}"
     self._groupNoteInterceptMaxKey = f"{c.groupNoteInterceptMaxKey}_{self._groupNumber}"
     self._groupNoteInterceptMinKey = f"{c.groupNoteInterceptMinKey}_{self._groupNumber}"
+    self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupNumber}"
     self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupNumber}"
     self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupNumber}"
     self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupNumber}"
@@ -151,6 +153,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
       c.internalRatingStatusKey: self._groupRatingStatusKey,
       c.noteInterceptMinKey: self._groupNoteInterceptMinKey,
       c.noteInterceptMaxKey: self._groupNoteInterceptMaxKey,
+      c.internalActiveRulesKey: self._groupInternalActiveRulesKey,
     }
 
   def _get_user_col_mapping(self) -> Dict[str, str]:
@@ -169,6 +172,7 @@ def get_scored_notes_cols(self) -> List[str]:
       self._groupRatingStatusKey,
       self._groupNoteInterceptMaxKey,
       self._groupNoteInterceptMinKey,
+      self._groupInternalActiveRulesKey,
       self._modelingGroupKey,
     ]
 
@@ -189,7 +193,6 @@ def _get_dropped_note_cols(self) -> List[str]:
     """Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
     return super()._get_dropped_note_cols() + (
       [
-        c.internalActiveRulesKey,
         c.activeFilterTagsKey,
         c.ratingWeightKey,
       ]
diff --git a/sourcecode/scoring/mf_topic_scorer.py b/sourcecode/scoring/mf_topic_scorer.py
@@ -26,6 +26,7 @@ def coalesce_topic_models(scoredNotes: pd.DataFrame) -> pd.DataFrame:
     c.topicRatingStatusKey,
     c.topicNoteConfidentKey,
     c.noteTopicKey,
+    c.topicInternalActiveRulesKey,
   ]:
     scoredNotes = coalesce_columns(scoredNotes, col)
 
@@ -106,6 +107,7 @@ def __init__(
     self._topicNoteInterceptKey = f"{c.topicNoteInterceptKey}_{self._topicName}"
     self._topicNoteFactor1Key = f"{c.topicNoteFactor1Key}_{self._topicName}"
     self._topicRatingStatusKey = f"{c.topicRatingStatusKey}_{self._topicName}"
+    self._topicInternalActiveRulesKey = f"{c.topicInternalActiveRulesKey}_{self._topicName}"
     self._noteTopicKey = f"{c.noteTopicKey}_{self._topicName}"
     self._noteTopicConfidentKey = f"{c.topicNoteConfidentKey}_{self._topicName}"
 
@@ -118,6 +120,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
       c.internalNoteInterceptKey: self._topicNoteInterceptKey,
       c.internalNoteFactor1Key: self._topicNoteFactor1Key,
       c.internalRatingStatusKey: self._topicRatingStatusKey,
+      c.internalActiveRulesKey: self._topicInternalActiveRulesKey,
     }
 
   def get_scored_notes_cols(self) -> List[str]:
@@ -129,6 +132,7 @@ def get_scored_notes_cols(self) -> List[str]:
       self._topicRatingStatusKey,
       self._noteTopicKey,
       self._noteTopicConfidentKey,
+      self._topicInternalActiveRulesKey,
     ]
 
   def get_helpfulness_scores_cols(self) -> List[str]:
@@ -143,7 +147,6 @@ def _get_dropped_note_cols(self) -> List[str]:
     """Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
     return super()._get_dropped_note_cols() + (
       [
-        c.internalActiveRulesKey,
         c.activeFilterTagsKey,
         c.ratingWeightKey,
         c.noteInterceptMinKey,
diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py
@@ -421,15 +421,29 @@ def write_tsv_local(df: pd.DataFrame, path: str) -> None:
   Args:
     df: pd.DataFrame to write to disk.
     path: location of file on disk.
-
-  Returns:
-    None, because path is always None.
   """
 
   assert path is not None
   assert df.to_csv(path, index=False, header=True, sep="\t") is None
 
 
+def write_parquet_local(
+  df: pd.DataFrame, path: str, compression: str = "snappy", engine: str = "pyarrow"
+) -> None:
+  """Write DF as a parquet file stored to local disk. Compress with snappy
+  and use pyarrow engine unless `compression` / `engine` are overridden.
+
+  Args:
+    df: pd.DataFrame to write to disk.
+    path: location of file on disk.
+    compression: compression algorithm to use. Defaults to 'snappy'.
+    engine: engine to use. Defaults to 'pyarrow'.
+  """
+
+  assert path is not None
+  df.to_parquet(path, compression=compression, engine=engine)
+
+
 class CommunityNotesDataLoader(ABC):
   """Base class which local and prod data loaders extend.
 
diff --git a/sourcecode/scoring/run_scoring.py b/sourcecode/scoring/run_scoring.py
@@ -254,7 +254,9 @@ def _run_scorer_parallelizable(
         scoringArgs = _load_data_from_shared_memory_parallelizable(
           scoringArgsSharedMemory, scoringArgs
         )
-        print(f"{scorer.get_name()} run_scorer_parallelizable just finished loading data from shared memory.")
+        print(
+          f"{scorer.get_name()} run_scorer_parallelizable just finished loading data from shared memory."
+        )
       elif dataLoader is not None:
         print(
           f"{scorer.get_name()} run_scorer_parallelizable just started in parallel: loading data with dataLoader."
@@ -522,19 +524,25 @@ def meta_score(
       # MFExpansionPlusScorer will have the lowest priority.
       rules.append(
         scoring_rules.ApplyModelResult(
-          RuleID.EXPANSION_PLUS_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionPlusRatingStatusKey
+          RuleID.EXPANSION_PLUS_MODEL,
+          {RuleID.META_INITIAL_NMR},
+          c.expansionPlusRatingStatusKey,
         )
       )
     if enabledScorers is None or Scorers.MFExpansionScorer in enabledScorers:
       rules.append(
         scoring_rules.ApplyModelResult(
-          RuleID.EXPANSION_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionRatingStatusKey
+          RuleID.EXPANSION_MODEL,
+          {RuleID.META_INITIAL_NMR},
+          c.expansionRatingStatusKey,
         )
       )
     if enabledScorers is None or Scorers.MFCoreScorer in enabledScorers:
       rules.append(
         scoring_rules.ApplyModelResult(
-          RuleID.CORE_MODEL, {RuleID.META_INITIAL_NMR}, c.coreRatingStatusKey
+          RuleID.CORE_MODEL,
+          {RuleID.META_INITIAL_NMR},
+          c.coreRatingStatusKey,
         )
       )
     if enabledScorers is None or Scorers.MFGroupScorer in enabledScorers:
diff --git a/sourcecode/scoring/runner.py b/sourcecode/scoring/runner.py
@@ -3,7 +3,12 @@
 
 from . import constants as c
 from .enums import scorers_from_csv
-from .process_data import LocalDataLoader, write_prescoring_output, write_tsv_local
+from .process_data import (
+  LocalDataLoader,
+  write_parquet_local,
+  write_prescoring_output,
+  write_tsv_local,
+)
 from .run_scoring import run_scoring
 
 
@@ -84,6 +89,13 @@ def parse_args():
     dest="prescoring_delay_hours",
     help="Filter prescoring input to simulate delay in hours",
   )
+  parser.add_argument(
+    "--no-parquet",
+    help="Disable writing parquet files.",
+    default=False,
+    action="store_true",
+    dest="no_parquet",
+  )
 
   return parser.parse_args()
 
@@ -138,6 +150,12 @@ def prescoring_write_fn(notePath, raterPath):
   write_tsv_local(newStatus, os.path.join(args.outdir, "note_status_history.tsv"))
   write_tsv_local(auxNoteInfo, os.path.join(args.outdir, "aux_note_info.tsv"))
 
+  if not args.no_parquet:
+    write_parquet_local(scoredNotes, os.path.join(args.outdir, "scored_notes.parquet"))
+    write_parquet_local(helpfulnessScores, os.path.join(args.outdir, "helpfulness_scores.parquet"))
+    write_parquet_local(newStatus, os.path.join(args.outdir, "note_status_history.parquet"))
+    write_parquet_local(auxNoteInfo, os.path.join(args.outdir, "aux_note_info.parquet"))
+
 
 if __name__ == "__main__":
   main()
diff --git a/sourcecode/scoring/scoring_rules.py b/sourcecode/scoring/scoring_rules.py

Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:`
`43`	`43`	`c.internalRatingStatusKey: c.expansionRatingStatusKey,`
`44`	`44`	`c.noteInterceptMinKey: c.expansionNoteInterceptMinKey,`
`45`	`45`	`c.noteInterceptMaxKey: c.expansionNoteInterceptMaxKey,`
	`46`	`+ c.internalActiveRulesKey: c.expansionInternalActiveRulesKey,`
`46`	`47`	`}`
`47`	`48`
`48`	`49`	`def get_scored_notes_cols(self) -> List[str]:`
`@@ -54,6 +55,7 @@ def get_scored_notes_cols(self) -> List[str]:`
`54`	`55`	`c.expansionRatingStatusKey,`
`55`	`56`	`c.expansionNoteInterceptMinKey,`
`56`	`57`	`c.expansionNoteInterceptMaxKey,`
	`58`	`+ c.expansionInternalActiveRulesKey,`
`57`	`59`	`]`
`58`	`60`
`59`	`61`	`def get_helpfulness_scores_cols(self) -> List[str]:`
`@@ -68,7 +70,6 @@ def _get_dropped_note_cols(self) -> List[str]:`
`68`	`70`	`"""Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""`
`69`	`71`	`return super()._get_dropped_note_cols() + (`
`70`	`72`	`[`
`71`		`- c.internalActiveRulesKey,`
`72`	`73`	`c.activeFilterTagsKey,`
`73`	`74`	`c.ratingWeightKey,`
`74`	`75`	`]`
Original file line number	Diff line number	Diff line change
`@@ -254,7 +254,9 @@ def _run_scorer_parallelizable(`
`254`	`254`	`scoringArgs = _load_data_from_shared_memory_parallelizable(`
`255`	`255`	`scoringArgsSharedMemory, scoringArgs`
`256`	`256`	`)`
`257`		`- print(f"{scorer.get_name()} run_scorer_parallelizable just finished loading data from shared memory.")`
	`257`	`+ print(`
	`258`	`+ f"{scorer.get_name()} run_scorer_parallelizable just finished loading data from shared memory."`
	`259`	`+ )`
`258`	`260`	`elif dataLoader is not None:`
`259`	`261`	`print(`
`260`	`262`	`f"{scorer.get_name()} run_scorer_parallelizable just started in parallel: loading data with dataLoader."`
`@@ -522,19 +524,25 @@ def meta_score(`
`522`	`524`	`# MFExpansionPlusScorer will have the lowest priority.`
`523`	`525`	`rules.append(`
`524`	`526`	`scoring_rules.ApplyModelResult(`
`525`		`- RuleID.EXPANSION_PLUS_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionPlusRatingStatusKey`
	`527`	`+ RuleID.EXPANSION_PLUS_MODEL,`
	`528`	`+ {RuleID.META_INITIAL_NMR},`
	`529`	`+ c.expansionPlusRatingStatusKey,`
`526`	`530`	`)`
`527`	`531`	`)`
`528`	`532`	`if enabledScorers is None or Scorers.MFExpansionScorer in enabledScorers:`
`529`	`533`	`rules.append(`
`530`	`534`	`scoring_rules.ApplyModelResult(`
`531`		`- RuleID.EXPANSION_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionRatingStatusKey`
	`535`	`+ RuleID.EXPANSION_MODEL,`
	`536`	`+ {RuleID.META_INITIAL_NMR},`
	`537`	`+ c.expansionRatingStatusKey,`
`532`	`538`	`)`
`533`	`539`	`)`
`534`	`540`	`if enabledScorers is None or Scorers.MFCoreScorer in enabledScorers:`
`535`	`541`	`rules.append(`
`536`	`542`	`scoring_rules.ApplyModelResult(`
`537`		`- RuleID.CORE_MODEL, {RuleID.META_INITIAL_NMR}, c.coreRatingStatusKey`
	`543`	`+ RuleID.CORE_MODEL,`
	`544`	`+ {RuleID.META_INITIAL_NMR},`
	`545`	`+ c.coreRatingStatusKey,`
`538`	`546`	`)`
`539`	`547`	`)`
`540`	`548`	`if enabledScorers is None or Scorers.MFGroupScorer in enabledScorers:`