twitter
diff --git a/Diff for: ‎sourcecode/scoring/constants.py
+77-36 b/Diff for: ‎sourcecode/scoring/constants.py
+77-36
diff --git a/Diff for: ‎sourcecode/scoring/enums.py
+1 b/Diff for: ‎sourcecode/scoring/enums.py
+1
diff --git a/Diff for: ‎sourcecode/scoring/mf_group_scorer.py
+28-30 b/Diff for: ‎sourcecode/scoring/mf_group_scorer.py
+28-30
@@ -3,7 +3,7 @@
 from enum import Enum
 import os
 import time
-from typing import Dict, Optional
+from typing import Dict, Optional, Set
 
 import numpy as np
 import pandas as pd
@@ -34,12 +34,17 @@
 intervalHalfWidth = 0.3
 
 # Max flip rates
-prescoringAllUnlockedNotesMaxCrhChurn = 0.04
-finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.03
+prescoringAllUnlockedNotesMaxCrhChurn = 0.2
+prescoringAllNotesCreatedThreeToThirteenDaysAgoMaxChurn = 0.06
+finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.05
 finalNotesWithNewRatingsMaxNewCrhChurn = 0.80
 finalNotesWithNewRatingsMaxOldCrhChurn = 0.25
 finalNotesThatJustFlippedStatusMaxCrhChurn = 1e8
 finalNotesThatFlippedRecentlyMaxCrhChurn = 1e8
+# TODO(jiansongc): adjust these 2 below
+finalNotesNmrDueToMinStableCrhTimeMaxOldCrhChurn = 1.0
+finalNotesNmrDueToMinStableCrhTimeMaxNewCrhChurn = 1.0
+
 
 # Data Filenames
 scoredNotesOutputPath = "scoredNotes.tsv"
@@ -59,17 +64,14 @@
 authorTopNotHelpfulTagValues = "authorTopNotHelpfulTagValues"
 modelingPopulationKey = "modelingPopulation"
 modelingGroupKey = "modelingGroup"
+modelingMultiGroupKey = "modelingMultiGroup"
 numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
 defaultIndexKey = "index"
 
 # Scoring Groups
-coreGroups = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
-expansionGroups = (
-  # Divide into 3 grouping aggregates to prepare for multi-group models,
-  # and a 4th group containing leftovers
-  {0, 15, 17, 24, 29, 30} | {4, 5, 7, 12, 26} | {27} | {16, 20, 22, 23, 28}
-)
-expansionPlusGroups = {18}
+coreGroups: Set[int] = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
+expansionGroups: Set[int] = {0, 4, 5, 7, 12, 16, 18, 20, 22, 23, 24, 26, 27, 28}
+expansionPlusGroups: Set[int] = {15, 17, 29, 30}
 
 # TSV Values
 notHelpfulValueTsv = "NOT_HELPFUL"
@@ -193,6 +195,14 @@ def rater_factor_key(i):
 groupRaterFactor1Key = "groupRaterFactor1"
 groupInternalActiveRulesKey = "groupActiveRules"
 groupNumFinalRoundRatingsKey = "groupNumFinalRoundRatings"
+# MultiGroup Model
+multiGroupNoteInterceptKey = "multiGroupNoteIntercept"
+multiGroupNoteFactor1Key = "multiGroupNoteFactor1"
+multiGroupRatingStatusKey = "multiGroupRatingStatus"
+multiGroupRaterInterceptKey = "multiGroupRaterIntercept"
+multiGroupRaterFactor1Key = "multiGroupRaterFactor1"
+multiGroupInternalActiveRulesKey = "multiGroupActiveRules"
+multiGroupNumFinalRoundRatingsKey = "multiGroupNumFinalRoundRatings"
 # Topic Model
 topicNoteInterceptKey = "topicNoteIntercept"
 topicNoteFactor1Key = "topicNoteFactor1"
@@ -445,6 +455,12 @@ def rater_factor_key(i):
 currentDecidedByKey = "currentDecidedBy"
 currentModelingGroupKey = "currentModelingGroup"
 timestampMillisOfMostRecentStatusChangeKey = "timestampMillisOfMostRecentStatusChange"
+currentMultiGroupStatusKey = "currentMultiGroupStatus"
+currentModelingMultiGroupKey = "currentModelingMultiGroup"
+timestampMillisOfNmrDueToMinStableCrhTimeKey = "timestampMillisOfNmrDueToMinStableCrhTime"
+updatedTimestampMillisOfNmrDueToMinStableCrhTimeKey = (
+  "updatedTimestampMillisOfNmrDueToMinStableCrhTime"
+)
 
 noteStatusHistoryTSVColumnsAndTypes = [
   (noteIdKey, np.int64),
@@ -465,12 +481,22 @@ def rater_factor_key(i):
   (currentDecidedByKey, "category"),
   (currentModelingGroupKey, np.double),  # TODO: int
   (timestampMillisOfMostRecentStatusChangeKey, np.double),  # double because nullable.
+  (timestampMillisOfNmrDueToMinStableCrhTimeKey, np.double),  # double because nullable.
+  (currentMultiGroupStatusKey, "category"),
+  (currentModelingMultiGroupKey, np.double),  # TODO: int
 ]
 noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
 noteStatusHistoryTSVTypes = [dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
 noteStatusHistoryTSVTypeMapping = {
   col: dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes
 }
+# TODO(jiansongc): clean up after new column is in production.
+noteStatusHistoryTSVColumnsOld = noteStatusHistoryTSVColumns[:-1]
+noteStatusHistoryTSVColumnsAndTypesOld = noteStatusHistoryTSVColumnsAndTypes[:-1]
+noteStatusHistoryTSVTypeMappingOld = {
+  col: dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypesOld
+}
+
 
 # Earn In + Earn Out
 enrollmentState = "enrollmentState"
@@ -587,6 +613,8 @@ def rater_factor_key(i):
   {
     coverageNoteInterceptMinKey,
     coverageNoteInterceptMaxKey,
+    groupNoteInterceptMinKey,
+    groupNoteInterceptMaxKey,
   }
 )
 
@@ -610,58 +638,64 @@ def rater_factor_key(i):
   (noteIdKey, np.int64),
   (coreNoteInterceptKey, np.double),
   (coreNoteFactor1Key, np.double),
-  (finalRatingStatusKey, str),
-  (firstTagKey, str),
-  (secondTagKey, str),
+  (finalRatingStatusKey, "category"),
+  (firstTagKey, "category"),
+  (secondTagKey, "category"),
   # Note that this column was formerly named "activeRules" and the name is now
   # updated to "coreActiveRules".  The data values remain compatible,
   # but the new column only contains rules that ran when deciding status based on
   # the core model.
-  (coreActiveRulesKey, str),
-  (activeFilterTagsKey, str),
-  (classificationKey, str),
+  (coreActiveRulesKey, "category"),
+  (activeFilterTagsKey, "category"),
+  (classificationKey, "category"),
   (createdAtMillisKey, np.int64),
-  (coreRatingStatusKey, str),
-  (metaScorerActiveRulesKey, str),
-  (decidedByKey, str),
+  (coreRatingStatusKey, "category"),
+  (metaScorerActiveRulesKey, "category"),
+  (decidedByKey, "category"),
   (expansionNoteInterceptKey, np.double),
   (expansionNoteFactor1Key, np.double),
-  (expansionRatingStatusKey, str),
+  (expansionRatingStatusKey, "category"),
   (coverageNoteInterceptKey, np.double),
   (coverageNoteFactor1Key, np.double),
-  (coverageRatingStatusKey, str),
+  (coverageRatingStatusKey, "category"),
   (coreNoteInterceptMinKey, np.double),
   (coreNoteInterceptMaxKey, np.double),
-  (expansionNoteInterceptMinKey, np.double),
-  (expansionNoteInterceptMaxKey, np.double),
-  (coverageNoteInterceptMinKey, np.double),
-  (coverageNoteInterceptMaxKey, np.double),
+  (expansionNoteInterceptMinKey, "category"),  # category because always nan
+  (expansionNoteInterceptMaxKey, "category"),  # category because always nan
+  (coverageNoteInterceptMinKey, "category"),  # category because always nan
+  (coverageNoteInterceptMaxKey, "category"),  # category because always nan
   (groupNoteInterceptKey, np.double),
   (groupNoteFactor1Key, np.double),
-  (groupRatingStatusKey, str),
-  (groupNoteInterceptMaxKey, np.double),
-  (groupNoteInterceptMinKey, np.double),
+  (groupRatingStatusKey, "category"),
+  (groupNoteInterceptMaxKey, "category"),  # category because always nan
+  (groupNoteInterceptMinKey, "category"),  # category because always nan
   (modelingGroupKey, np.float64),
   (numRatingsKey, np.int64),
   (timestampMillisOfNoteCurrentLabelKey, np.double),
   (expansionPlusNoteInterceptKey, np.double),
   (expansionPlusNoteFactor1Key, np.double),
-  (expansionPlusRatingStatusKey, str),
+  (expansionPlusRatingStatusKey, "category"),
   (topicNoteInterceptKey, np.double),
   (topicNoteFactor1Key, np.double),
-  (topicRatingStatusKey, str),
-  (noteTopicKey, str),
+  (topicRatingStatusKey, "category"),
+  (noteTopicKey, "category"),
   (topicNoteConfidentKey, pd.BooleanDtype()),
-  (expansionInternalActiveRulesKey, str),
-  (expansionPlusInternalActiveRulesKey, str),
-  (groupInternalActiveRulesKey, str),
-  (topicInternalActiveRulesKey, str),
+  (expansionInternalActiveRulesKey, "category"),
+  (expansionPlusInternalActiveRulesKey, "category"),
+  (groupInternalActiveRulesKey, "category"),
+  (topicInternalActiveRulesKey, "category"),
   (coreNumFinalRoundRatingsKey, np.double),  # double because nullable.
   (expansionNumFinalRoundRatingsKey, np.double),  # double because nullable.
   (expansionPlusNumFinalRoundRatingsKey, np.double),  # double because nullable.
   (groupNumFinalRoundRatingsKey, np.double),  # double because nullable.
   (topicNumFinalRoundRatingsKey, np.double),  # double because nullable.
-  (rescoringActiveRulesKey, str),
+  (rescoringActiveRulesKey, "category"),
+  (multiGroupNoteInterceptKey, np.double),
+  (multiGroupNoteFactor1Key, np.double),
+  (multiGroupRatingStatusKey, str),
+  (modelingMultiGroupKey, np.float64),
+  (multiGroupInternalActiveRulesKey, str),
+  (multiGroupNumFinalRoundRatingsKey, np.double),  # double because nullable.
 ]
 noteModelOutputTSVColumns = [col for (col, dtype) in noteModelOutputTSVColumnsAndTypes]
 noteModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in noteModelOutputTSVColumnsAndTypes}
@@ -733,6 +767,9 @@ def rater_factor_key(i):
   (expansionRaterFactor1Key, np.double),
   (expansionPlusRaterInterceptKey, np.double),
   (expansionPlusRaterFactor1Key, np.double),
+  (multiGroupRaterInterceptKey, np.double),
+  (multiGroupRaterFactor1Key, np.double),
+  (modelingMultiGroupKey, np.float64),
 ]
 raterModelOutputTSVColumns = [col for (col, dtype) in raterModelOutputTSVColumnsAndTypes]
 raterModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in raterModelOutputTSVColumnsAndTypes}
@@ -781,6 +818,8 @@ def rater_factor_key(i):
 inputPathsTSVColumns = [col for (col, _) in inputPathsTSVColumnsAndTypes]
 inputPathsTSVTypeMapping = {col: dtype for (col, dtype) in inputPathsTSVColumnsAndTypes}
 
+timestampMinuteOfFinalScoringOutput = "timestampMinuteOfFinalScoringOutput"
+
 
 @contextmanager
 def time_block(label):
@@ -888,6 +927,8 @@ class RescoringRuleID(Enum):
   NOTES_FLIPPED_PREVIOUS_RUN = 3
   NEW_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 4
   RECENTLY_FLIPPED_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 5
+  NMR_DUE_TO_MIN_STABLE_CRH_TIME = 6
+  NOTES_CREATED_SOMEWHAT_RECENTLY = 7
 
 
 @dataclass
 
@@ -13,6 +13,7 @@ class Scorers(Enum):
   MFExpansionPlusScorer = auto()
   ReputationScorer = auto()
   MFTopicScorer = auto()
+  MFMultiGroupScorer = auto()
 
 
 class Topics(Enum):
 
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Set, Tuple
 
 from . import constants as c
 from .mf_base_scorer import MFBaseScorer, coalesce_columns
@@ -13,7 +13,7 @@
 trialScoringGroup = 14
 
 # Mapping of how many threads to assign to each group scorer
-_groupScorerParalleism = {
+groupScorerParalleism = {
   # Group model 13 is larger and benefits from more threads.
   # Others can default to 4.
   13: 8
@@ -32,8 +32,6 @@ def coalesce_group_model_scored_notes(scoredNotes: pd.DataFrame) -> pd.DataFrame
     c.groupNoteInterceptKey,
     c.groupNoteFactor1Key,
     c.groupRatingStatusKey,
-    c.groupNoteInterceptMaxKey,
-    c.groupNoteInterceptMinKey,
     c.modelingGroupKey,
     c.groupInternalActiveRulesKey,
     c.groupNumFinalRoundRatingsKey,
@@ -59,9 +57,9 @@ def coalesce_group_model_helpfulness_scores(helpfulnessScores: pd.DataFrame) ->
 class MFGroupScorer(MFBaseScorer):
   def __init__(
     self,
-    groupNumber: int,
+    includedGroups: Set[int],
+    groupId: int,
     seed: Optional[int] = None,
-    pseudoraters: Optional[bool] = False,
     groupThreshold: float = 0.8,
     saveIntermediateState: bool = False,
     userFactorLambda=None,
@@ -86,6 +84,7 @@ def __init__(
     tagConsensusHarassmentHelpfulRatingPenalty: int = 10,
     tagFilterPercentile: int = 95,
     incorrectFilterThreshold: float = 2.5,
+    threads: int = 4,
   ) -> None:
     """Configure MFGroupScorer object.
 
@@ -104,14 +103,14 @@ def __init__(
         for the model to be active
     """
     super().__init__(
-      includedGroups={groupNumber},
+      includedGroups=includedGroups,
       includeUnassigned=False,
       captureThreshold=groupThreshold,
       seed=seed,
-      pseudoraters=pseudoraters,
+      pseudoraters=False,
       useStableInitialization=False,
       saveIntermediateState=saveIntermediateState,
-      threads=_groupScorerParalleism.get(groupNumber, 4),
+      threads=threads,
       userFactorLambda=userFactorLambda,
       noteFactorLambda=noteFactorLambda,
       userInterceptLambda=userInterceptLambda,
@@ -135,31 +134,30 @@ def __init__(
       tagFilterPercentile=tagFilterPercentile,
       incorrectFilterThreshold=incorrectFilterThreshold,
     )
-    assert groupNumber > 0, "groupNumber must be positive.  0 is reserved for unassigned."
-    assert groupNumber <= groupScorerCount, "groupNumber exceeds maximum expected groups."
-    self._groupNumber = groupNumber
-    self._groupNoteInterceptKey = f"{c.groupNoteInterceptKey}_{self._groupNumber}"
-    self._groupNoteFactor1Key = f"{c.groupNoteFactor1Key}_{self._groupNumber}"
-    self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupNumber}"
-    self._groupNoteInterceptMaxKey = f"{c.groupNoteInterceptMaxKey}_{self._groupNumber}"
-    self._groupNoteInterceptMinKey = f"{c.groupNoteInterceptMinKey}_{self._groupNumber}"
-    self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupNumber}"
-    self._groupNumFinalRoundRatingsKey = f"{c.groupNumFinalRoundRatingsKey}_{self._groupNumber}"
-    self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupNumber}"
-    self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupNumber}"
-    self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupNumber}"
+    assert groupId > 0, "groupNumber must be positive.  0 is reserved for unassigned."
+    self._groupId = groupId
+    self._init_column_names()
+
+  def _init_column_names(self):
+    """Initialize column names based on prefixes and groupId.
+
+    Every attribute set here is the matching shared key from the constants module
+    (imported as `c`) with "_<groupId>" appended.  The method reads self._groupId,
+    so that attribute must already be assigned when this is called.
+    """
+    # Note intercept / factor / status keys, plus the active-rules and
+    # final-round-rating-count keys for this group's model.
+    self._groupNoteInterceptKey = f"{c.groupNoteInterceptKey}_{self._groupId}"
+    self._groupNoteFactor1Key = f"{c.groupNoteFactor1Key}_{self._groupId}"
+    self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupId}"
+    self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupId}"
+    self._groupNumFinalRoundRatingsKey = f"{c.groupNumFinalRoundRatingsKey}_{self._groupId}"
+    # Rater intercept / factor keys for this group's model.
+    self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupId}"
+    self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupId}"
+    # Column that _postprocess_output fills with the group id on both outputs.
+    self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupId}"
 
   def get_name(self):
-    return f"MFGroupScorer_{self._groupNumber}"
+    """Return this scorer's name: "MFGroupScorer_" followed by the group id."""
+    return f"MFGroupScorer_{self._groupId}"
 
   def _get_note_col_mapping(self) -> Dict[str, str]:
     """Returns a dict mapping default note column names to custom names for a specific model."""
     return {
       c.internalNoteInterceptKey: self._groupNoteInterceptKey,
       c.internalNoteFactor1Key: self._groupNoteFactor1Key,
       c.internalRatingStatusKey: self._groupRatingStatusKey,
-      c.noteInterceptMinKey: self._groupNoteInterceptMinKey,
-      c.noteInterceptMaxKey: self._groupNoteInterceptMaxKey,
       c.internalActiveRulesKey: self._groupInternalActiveRulesKey,
       c.numFinalRoundRatingsKey: self._groupNumFinalRoundRatingsKey,
       c.lowDiligenceNoteInterceptKey: c.lowDiligenceLegacyNoteInterceptKey,
@@ -179,8 +177,6 @@ def get_scored_notes_cols(self) -> List[str]:
       self._groupNoteInterceptKey,
       self._groupNoteFactor1Key,
       self._groupRatingStatusKey,
-      self._groupNoteInterceptMaxKey,
-      self._groupNoteInterceptMinKey,
       self._groupInternalActiveRulesKey,
       self._modelingGroupKey,
       self._groupNumFinalRoundRatingsKey,
@@ -205,6 +201,8 @@ def _get_dropped_note_cols(self) -> List[str]:
       [
         c.activeFilterTagsKey,
         c.ratingWeightKey,
+        c.noteInterceptMinKey,
+        c.noteInterceptMaxKey,
       ]
       + c.notHelpfulTagsAdjustedColumns
       + c.notHelpfulTagsAdjustedRatioColumns
@@ -261,9 +259,9 @@ def _postprocess_output(
       ),
       how="left",
     )
-    userScores = userScores[userScores[c.modelingGroupKey] == self._groupNumber]
+    userScores = userScores[userScores[c.modelingGroupKey].isin(self._includedGroups)]
     userScores = userScores.drop(columns=c.modelingGroupKey)
     # Set the modelingGroupKey column in each output
-    noteScores[self._modelingGroupKey] = self._groupNumber
-    userScores[self._modelingGroupKey] = self._groupNumber
+    noteScores[self._modelingGroupKey] = self._groupId
+    userScores[self._modelingGroupKey] = self._groupId
     return noteScores, userScores