Skip to content

Commit 956d8bd

Browse files
authored
Merge pull request #256 from twitter/jbaxter/2024_08_12
Multi-group models, Support NMRDueToStableCRHTime, & more
2 parents 779c728 + 9cf9458 commit 956d8bd

8 files changed

+540
-165
lines changed

Diff for: sourcecode/scoring/constants.py

+77-36
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from enum import Enum
44
import os
55
import time
6-
from typing import Dict, Optional
6+
from typing import Dict, Optional, Set
77

88
import numpy as np
99
import pandas as pd
@@ -34,12 +34,17 @@
3434
intervalHalfWidth = 0.3
3535

3636
# Max flip rates
37-
prescoringAllUnlockedNotesMaxCrhChurn = 0.04
38-
finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.03
37+
prescoringAllUnlockedNotesMaxCrhChurn = 0.2
38+
prescoringAllNotesCreatedThreeToThirteenDaysAgoMaxChurn = 0.06
39+
finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.05
3940
finalNotesWithNewRatingsMaxNewCrhChurn = 0.80
4041
finalNotesWithNewRatingsMaxOldCrhChurn = 0.25
4142
finalNotesThatJustFlippedStatusMaxCrhChurn = 1e8
4243
finalNotesThatFlippedRecentlyMaxCrhChurn = 1e8
44+
# TODO(jiansongc): adjust these 2 below
45+
finalNotesNmrDueToMinStableCrhTimeMaxOldCrhChurn = 1.0
46+
finalNotesNmrDueToMinStableCrhTimeMaxNewCrhChurn = 1.0
47+
4348

4449
# Data Filenames
4550
scoredNotesOutputPath = "scoredNotes.tsv"
@@ -59,17 +64,14 @@
5964
authorTopNotHelpfulTagValues = "authorTopNotHelpfulTagValues"
6065
modelingPopulationKey = "modelingPopulation"
6166
modelingGroupKey = "modelingGroup"
67+
modelingMultiGroupKey = "modelingMultiGroup"
6268
numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
6369
defaultIndexKey = "index"
6470

6571
# Scoring Groups
66-
coreGroups = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
67-
expansionGroups = (
68-
# Divide into 3 grouping aggregates to prepare for multi-group models,
69-
# and a 4th group containing leftovers
70-
{0, 15, 17, 24, 29, 30} | {4, 5, 7, 12, 26} | {27} | {16, 20, 22, 23, 28}
71-
)
72-
expansionPlusGroups = {18}
72+
coreGroups: Set[int] = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
73+
expansionGroups: Set[int] = {0, 4, 5, 7, 12, 16, 18, 20, 22, 23, 24, 26, 27, 28}
74+
expansionPlusGroups: Set[int] = {15, 17, 29, 30}
7375

7476
# TSV Values
7577
notHelpfulValueTsv = "NOT_HELPFUL"
@@ -193,6 +195,14 @@ def rater_factor_key(i):
193195
groupRaterFactor1Key = "groupRaterFactor1"
194196
groupInternalActiveRulesKey = "groupActiveRules"
195197
groupNumFinalRoundRatingsKey = "groupNumFinalRoundRatings"
198+
# MultiGroup Model
199+
multiGroupNoteInterceptKey = "multiGroupNoteIntercept"
200+
multiGroupNoteFactor1Key = "multiGroupNoteFactor1"
201+
multiGroupRatingStatusKey = "multiGroupRatingStatus"
202+
multiGroupRaterInterceptKey = "multiGroupRaterIntercept"
203+
multiGroupRaterFactor1Key = "multiGroupRaterFactor1"
204+
multiGroupInternalActiveRulesKey = "multiGroupActiveRules"
205+
multiGroupNumFinalRoundRatingsKey = "multiGroupNumFinalRoundRatings"
196206
# Topic Model
197207
topicNoteInterceptKey = "topicNoteIntercept"
198208
topicNoteFactor1Key = "topicNoteFactor1"
@@ -445,6 +455,12 @@ def rater_factor_key(i):
445455
currentDecidedByKey = "currentDecidedBy"
446456
currentModelingGroupKey = "currentModelingGroup"
447457
timestampMillisOfMostRecentStatusChangeKey = "timestampMillisOfMostRecentStatusChange"
458+
currentMultiGroupStatusKey = "currentMultiGroupStatus"
459+
currentModelingMultiGroupKey = "currentModelingMultiGroup"
460+
timestampMillisOfNmrDueToMinStableCrhTimeKey = "timestampMillisOfNmrDueToMinStableCrhTime"
461+
updatedTimestampMillisOfNmrDueToMinStableCrhTimeKey = (
462+
"updatedTimestampMillisOfNmrDueToMinStableCrhTime"
463+
)
448464

449465
noteStatusHistoryTSVColumnsAndTypes = [
450466
(noteIdKey, np.int64),
@@ -465,12 +481,22 @@ def rater_factor_key(i):
465481
(currentDecidedByKey, "category"),
466482
(currentModelingGroupKey, np.double), # TODO: int
467483
(timestampMillisOfMostRecentStatusChangeKey, np.double), # double because nullable.
484+
(timestampMillisOfNmrDueToMinStableCrhTimeKey, np.double), # double because nullable.
485+
(currentMultiGroupStatusKey, "category"),
486+
(currentModelingMultiGroupKey, np.double), # TODO: int
468487
]
469488
noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
470489
noteStatusHistoryTSVTypes = [dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
471490
noteStatusHistoryTSVTypeMapping = {
472491
col: dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes
473492
}
493+
# TODO(jiansongc): clean up after new column is in production.
494+
noteStatusHistoryTSVColumnsOld = noteStatusHistoryTSVColumns[:-1]
495+
noteStatusHistoryTSVColumnsAndTypesOld = noteStatusHistoryTSVColumnsAndTypes[:-1]
496+
noteStatusHistoryTSVTypeMappingOld = {
497+
col: dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypesOld
498+
}
499+
474500

475501
# Earn In + Earn Out
476502
enrollmentState = "enrollmentState"
@@ -587,6 +613,8 @@ def rater_factor_key(i):
587613
{
588614
coverageNoteInterceptMinKey,
589615
coverageNoteInterceptMaxKey,
616+
groupNoteInterceptMinKey,
617+
groupNoteInterceptMaxKey,
590618
}
591619
)
592620

@@ -610,58 +638,64 @@ def rater_factor_key(i):
610638
(noteIdKey, np.int64),
611639
(coreNoteInterceptKey, np.double),
612640
(coreNoteFactor1Key, np.double),
613-
(finalRatingStatusKey, str),
614-
(firstTagKey, str),
615-
(secondTagKey, str),
641+
(finalRatingStatusKey, "category"),
642+
(firstTagKey, "category"),
643+
(secondTagKey, "category"),
616644
# Note that this column was formerly named "activeRules" and the name is now
617645
# updated to "coreActiveRules". The data values remain compatible,
618646
# but the new column only contains rules that ran when deciding status based on
619647
# the core model.
620-
(coreActiveRulesKey, str),
621-
(activeFilterTagsKey, str),
622-
(classificationKey, str),
648+
(coreActiveRulesKey, "category"),
649+
(activeFilterTagsKey, "category"),
650+
(classificationKey, "category"),
623651
(createdAtMillisKey, np.int64),
624-
(coreRatingStatusKey, str),
625-
(metaScorerActiveRulesKey, str),
626-
(decidedByKey, str),
652+
(coreRatingStatusKey, "category"),
653+
(metaScorerActiveRulesKey, "category"),
654+
(decidedByKey, "category"),
627655
(expansionNoteInterceptKey, np.double),
628656
(expansionNoteFactor1Key, np.double),
629-
(expansionRatingStatusKey, str),
657+
(expansionRatingStatusKey, "category"),
630658
(coverageNoteInterceptKey, np.double),
631659
(coverageNoteFactor1Key, np.double),
632-
(coverageRatingStatusKey, str),
660+
(coverageRatingStatusKey, "category"),
633661
(coreNoteInterceptMinKey, np.double),
634662
(coreNoteInterceptMaxKey, np.double),
635-
(expansionNoteInterceptMinKey, np.double),
636-
(expansionNoteInterceptMaxKey, np.double),
637-
(coverageNoteInterceptMinKey, np.double),
638-
(coverageNoteInterceptMaxKey, np.double),
663+
(expansionNoteInterceptMinKey, "category"), # category because always nan
664+
(expansionNoteInterceptMaxKey, "category"), # category because always nan
665+
(coverageNoteInterceptMinKey, "category"), # category because always nan
666+
(coverageNoteInterceptMaxKey, "category"), # category because always nan
639667
(groupNoteInterceptKey, np.double),
640668
(groupNoteFactor1Key, np.double),
641-
(groupRatingStatusKey, str),
642-
(groupNoteInterceptMaxKey, np.double),
643-
(groupNoteInterceptMinKey, np.double),
669+
(groupRatingStatusKey, "category"),
670+
(groupNoteInterceptMaxKey, "category"), # category because always nan
671+
(groupNoteInterceptMinKey, "category"), # category because always nan
644672
(modelingGroupKey, np.float64),
645673
(numRatingsKey, np.int64),
646674
(timestampMillisOfNoteCurrentLabelKey, np.double),
647675
(expansionPlusNoteInterceptKey, np.double),
648676
(expansionPlusNoteFactor1Key, np.double),
649-
(expansionPlusRatingStatusKey, str),
677+
(expansionPlusRatingStatusKey, "category"),
650678
(topicNoteInterceptKey, np.double),
651679
(topicNoteFactor1Key, np.double),
652-
(topicRatingStatusKey, str),
653-
(noteTopicKey, str),
680+
(topicRatingStatusKey, "category"),
681+
(noteTopicKey, "category"),
654682
(topicNoteConfidentKey, pd.BooleanDtype()),
655-
(expansionInternalActiveRulesKey, str),
656-
(expansionPlusInternalActiveRulesKey, str),
657-
(groupInternalActiveRulesKey, str),
658-
(topicInternalActiveRulesKey, str),
683+
(expansionInternalActiveRulesKey, "category"),
684+
(expansionPlusInternalActiveRulesKey, "category"),
685+
(groupInternalActiveRulesKey, "category"),
686+
(topicInternalActiveRulesKey, "category"),
659687
(coreNumFinalRoundRatingsKey, np.double), # double because nullable.
660688
(expansionNumFinalRoundRatingsKey, np.double), # double because nullable.
661689
(expansionPlusNumFinalRoundRatingsKey, np.double), # double because nullable.
662690
(groupNumFinalRoundRatingsKey, np.double), # double because nullable.
663691
(topicNumFinalRoundRatingsKey, np.double), # double because nullable.
664-
(rescoringActiveRulesKey, str),
692+
(rescoringActiveRulesKey, "category"),
693+
(multiGroupNoteInterceptKey, np.double),
694+
(multiGroupNoteFactor1Key, np.double),
695+
(multiGroupRatingStatusKey, str),
696+
(modelingMultiGroupKey, np.float64),
697+
(multiGroupInternalActiveRulesKey, str),
698+
(multiGroupNumFinalRoundRatingsKey, np.double), # double because nullable.
665699
]
666700
noteModelOutputTSVColumns = [col for (col, dtype) in noteModelOutputTSVColumnsAndTypes]
667701
noteModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in noteModelOutputTSVColumnsAndTypes}
@@ -733,6 +767,9 @@ def rater_factor_key(i):
733767
(expansionRaterFactor1Key, np.double),
734768
(expansionPlusRaterInterceptKey, np.double),
735769
(expansionPlusRaterFactor1Key, np.double),
770+
(multiGroupRaterInterceptKey, np.double),
771+
(multiGroupRaterFactor1Key, np.double),
772+
(modelingMultiGroupKey, np.float64),
736773
]
737774
raterModelOutputTSVColumns = [col for (col, dtype) in raterModelOutputTSVColumnsAndTypes]
738775
raterModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in raterModelOutputTSVColumnsAndTypes}
@@ -781,6 +818,8 @@ def rater_factor_key(i):
781818
inputPathsTSVColumns = [col for (col, _) in inputPathsTSVColumnsAndTypes]
782819
inputPathsTSVTypeMapping = {col: dtype for (col, dtype) in inputPathsTSVColumnsAndTypes}
783820

821+
timestampMinuteOfFinalScoringOutput = "timestampMinuteOfFinalScoringOutput"
822+
784823

785824
@contextmanager
786825
def time_block(label):
@@ -888,6 +927,8 @@ class RescoringRuleID(Enum):
888927
NOTES_FLIPPED_PREVIOUS_RUN = 3
889928
NEW_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 4
890929
RECENTLY_FLIPPED_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 5
930+
NMR_DUE_TO_MIN_STABLE_CRH_TIME = 6
931+
NOTES_CREATED_SOMEWHAT_RECENTLY = 7
891932

892933

893934
@dataclass

Diff for: sourcecode/scoring/enums.py

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ class Scorers(Enum):
1313
MFExpansionPlusScorer = auto()
1414
ReputationScorer = auto()
1515
MFTopicScorer = auto()
16+
MFMultiGroupScorer = auto()
1617

1718

1819
class Topics(Enum):

Diff for: sourcecode/scoring/mf_group_scorer.py

+28-30
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Dict, List, Optional, Tuple
1+
from typing import Dict, List, Optional, Set, Tuple
22

33
from . import constants as c
44
from .mf_base_scorer import MFBaseScorer, coalesce_columns
@@ -13,7 +13,7 @@
1313
trialScoringGroup = 14
1414

1515
# Mapping of how many threads to assign to each group scorer
16-
_groupScorerParalleism = {
16+
groupScorerParalleism = {
1717
# Group model 13 is larger and benefits from more threads.
1818
# Others can default to 4.
1919
13: 8
@@ -32,8 +32,6 @@ def coalesce_group_model_scored_notes(scoredNotes: pd.DataFrame) -> pd.DataFrame
3232
c.groupNoteInterceptKey,
3333
c.groupNoteFactor1Key,
3434
c.groupRatingStatusKey,
35-
c.groupNoteInterceptMaxKey,
36-
c.groupNoteInterceptMinKey,
3735
c.modelingGroupKey,
3836
c.groupInternalActiveRulesKey,
3937
c.groupNumFinalRoundRatingsKey,
@@ -59,9 +57,9 @@ def coalesce_group_model_helpfulness_scores(helpfulnessScores: pd.DataFrame) ->
5957
class MFGroupScorer(MFBaseScorer):
6058
def __init__(
6159
self,
62-
groupNumber: int,
60+
includedGroups: Set[int],
61+
groupId: int,
6362
seed: Optional[int] = None,
64-
pseudoraters: Optional[bool] = False,
6563
groupThreshold: float = 0.8,
6664
saveIntermediateState: bool = False,
6765
userFactorLambda=None,
@@ -86,6 +84,7 @@ def __init__(
8684
tagConsensusHarassmentHelpfulRatingPenalty: int = 10,
8785
tagFilterPercentile: int = 95,
8886
incorrectFilterThreshold: float = 2.5,
87+
threads: int = 4,
8988
) -> None:
9089
"""Configure MFGroupScorer object.
9190
@@ -104,14 +103,14 @@ def __init__(
104103
for the model to be active
105104
"""
106105
super().__init__(
107-
includedGroups={groupNumber},
106+
includedGroups=includedGroups,
108107
includeUnassigned=False,
109108
captureThreshold=groupThreshold,
110109
seed=seed,
111-
pseudoraters=pseudoraters,
110+
pseudoraters=False,
112111
useStableInitialization=False,
113112
saveIntermediateState=saveIntermediateState,
114-
threads=_groupScorerParalleism.get(groupNumber, 4),
113+
threads=threads,
115114
userFactorLambda=userFactorLambda,
116115
noteFactorLambda=noteFactorLambda,
117116
userInterceptLambda=userInterceptLambda,
@@ -135,31 +134,30 @@ def __init__(
135134
tagFilterPercentile=tagFilterPercentile,
136135
incorrectFilterThreshold=incorrectFilterThreshold,
137136
)
138-
assert groupNumber > 0, "groupNumber must be positive. 0 is reserved for unassigned."
139-
assert groupNumber <= groupScorerCount, "groupNumber exceeds maximum expected groups."
140-
self._groupNumber = groupNumber
141-
self._groupNoteInterceptKey = f"{c.groupNoteInterceptKey}_{self._groupNumber}"
142-
self._groupNoteFactor1Key = f"{c.groupNoteFactor1Key}_{self._groupNumber}"
143-
self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupNumber}"
144-
self._groupNoteInterceptMaxKey = f"{c.groupNoteInterceptMaxKey}_{self._groupNumber}"
145-
self._groupNoteInterceptMinKey = f"{c.groupNoteInterceptMinKey}_{self._groupNumber}"
146-
self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupNumber}"
147-
self._groupNumFinalRoundRatingsKey = f"{c.groupNumFinalRoundRatingsKey}_{self._groupNumber}"
148-
self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupNumber}"
149-
self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupNumber}"
150-
self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupNumber}"
137+
assert groupId > 0, "groupNumber must be positive. 0 is reserved for unassigned."
138+
self._groupId = groupId
139+
self._init_column_names()
140+
141+
def _init_column_names(self):
142+
"""Initialize column names based on prefixes and groupId."""
143+
self._groupNoteInterceptKey = f"{c.groupNoteInterceptKey}_{self._groupId}"
144+
self._groupNoteFactor1Key = f"{c.groupNoteFactor1Key}_{self._groupId}"
145+
self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupId}"
146+
self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupId}"
147+
self._groupNumFinalRoundRatingsKey = f"{c.groupNumFinalRoundRatingsKey}_{self._groupId}"
148+
self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupId}"
149+
self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupId}"
150+
self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupId}"
151151

152152
def get_name(self):
153-
return f"MFGroupScorer_{self._groupNumber}"
153+
return f"MFGroupScorer_{self._groupId}"
154154

155155
def _get_note_col_mapping(self) -> Dict[str, str]:
156156
"""Returns a dict mapping default note column names to custom names for a specific model."""
157157
return {
158158
c.internalNoteInterceptKey: self._groupNoteInterceptKey,
159159
c.internalNoteFactor1Key: self._groupNoteFactor1Key,
160160
c.internalRatingStatusKey: self._groupRatingStatusKey,
161-
c.noteInterceptMinKey: self._groupNoteInterceptMinKey,
162-
c.noteInterceptMaxKey: self._groupNoteInterceptMaxKey,
163161
c.internalActiveRulesKey: self._groupInternalActiveRulesKey,
164162
c.numFinalRoundRatingsKey: self._groupNumFinalRoundRatingsKey,
165163
c.lowDiligenceNoteInterceptKey: c.lowDiligenceLegacyNoteInterceptKey,
@@ -179,8 +177,6 @@ def get_scored_notes_cols(self) -> List[str]:
179177
self._groupNoteInterceptKey,
180178
self._groupNoteFactor1Key,
181179
self._groupRatingStatusKey,
182-
self._groupNoteInterceptMaxKey,
183-
self._groupNoteInterceptMinKey,
184180
self._groupInternalActiveRulesKey,
185181
self._modelingGroupKey,
186182
self._groupNumFinalRoundRatingsKey,
@@ -205,6 +201,8 @@ def _get_dropped_note_cols(self) -> List[str]:
205201
[
206202
c.activeFilterTagsKey,
207203
c.ratingWeightKey,
204+
c.noteInterceptMinKey,
205+
c.noteInterceptMaxKey,
208206
]
209207
+ c.notHelpfulTagsAdjustedColumns
210208
+ c.notHelpfulTagsAdjustedRatioColumns
@@ -261,9 +259,9 @@ def _postprocess_output(
261259
),
262260
how="left",
263261
)
264-
userScores = userScores[userScores[c.modelingGroupKey] == self._groupNumber]
262+
userScores = userScores[userScores[c.modelingGroupKey].isin(self._includedGroups)]
265263
userScores = userScores.drop(columns=c.modelingGroupKey)
266264
# Set the modelingGroupKey column in each output
267-
noteScores[self._modelingGroupKey] = self._groupNumber
268-
userScores[self._modelingGroupKey] = self._groupNumber
265+
noteScores[self._modelingGroupKey] = self._groupId
266+
userScores[self._modelingGroupKey] = self._groupId
269267
return noteScores, userScores

0 commit comments

Comments
 (0)