3
3
from enum import Enum
4
4
import os
5
5
import time
6
- from typing import Dict , Optional
6
+ from typing import Dict , Optional , Set
7
7
8
8
import numpy as np
9
9
import pandas as pd
34
34
intervalHalfWidth = 0.3
35
35
36
36
# Max flip rates
37
- prescoringAllUnlockedNotesMaxCrhChurn = 0.04
38
- finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.03
37
+ prescoringAllUnlockedNotesMaxCrhChurn = 0.2
38
+ prescoringAllNotesCreatedThreeToThirteenDaysAgoMaxChurn = 0.06
39
+ finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.05
39
40
finalNotesWithNewRatingsMaxNewCrhChurn = 0.80
40
41
finalNotesWithNewRatingsMaxOldCrhChurn = 0.25
41
42
finalNotesThatJustFlippedStatusMaxCrhChurn = 1e8
42
43
finalNotesThatFlippedRecentlyMaxCrhChurn = 1e8
44
+ # TODO(jiansongc): adjust the two NmrDueToMinStableCrhTime churn thresholds below
45
+ finalNotesNmrDueToMinStableCrhTimeMaxOldCrhChurn = 1.0
46
+ finalNotesNmrDueToMinStableCrhTimeMaxNewCrhChurn = 1.0
47
+
43
48
44
49
# Data Filenames
45
50
scoredNotesOutputPath = "scoredNotes.tsv"
59
64
authorTopNotHelpfulTagValues = "authorTopNotHelpfulTagValues"
60
65
modelingPopulationKey = "modelingPopulation"
61
66
modelingGroupKey = "modelingGroup"
67
+ modelingMultiGroupKey = "modelingMultiGroup"
62
68
numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
63
69
defaultIndexKey = "index"
64
70
65
71
# Scoring Groups
66
- coreGroups = {1 , 2 , 3 , 6 , 8 , 9 , 10 , 11 , 13 , 14 , 19 , 21 , 25 }
67
- expansionGroups = (
68
- # Divide into 3 grouping aggregates to prepare for multi-group models,
69
- # and a 4th group containing leftovers
70
- {0 , 15 , 17 , 24 , 29 , 30 } | {4 , 5 , 7 , 12 , 26 } | {27 } | {16 , 20 , 22 , 23 , 28 }
71
- )
72
- expansionPlusGroups = {18 }
72
+ coreGroups : Set [int ] = {1 , 2 , 3 , 6 , 8 , 9 , 10 , 11 , 13 , 14 , 19 , 21 , 25 }
73
+ expansionGroups : Set [int ] = {0 , 4 , 5 , 7 , 12 , 16 , 18 , 20 , 22 , 23 , 24 , 26 , 27 , 28 }
74
+ expansionPlusGroups : Set [int ] = {15 , 17 , 29 , 30 }
73
75
74
76
# TSV Values
75
77
notHelpfulValueTsv = "NOT_HELPFUL"
@@ -193,6 +195,14 @@ def rater_factor_key(i):
193
195
groupRaterFactor1Key = "groupRaterFactor1"
194
196
groupInternalActiveRulesKey = "groupActiveRules"
195
197
groupNumFinalRoundRatingsKey = "groupNumFinalRoundRatings"
198
+ # MultiGroup Model
199
+ multiGroupNoteInterceptKey = "multiGroupNoteIntercept"
200
+ multiGroupNoteFactor1Key = "multiGroupNoteFactor1"
201
+ multiGroupRatingStatusKey = "multiGroupRatingStatus"
202
+ multiGroupRaterInterceptKey = "multiGroupRaterIntercept"
203
+ multiGroupRaterFactor1Key = "multiGroupRaterFactor1"
204
+ multiGroupInternalActiveRulesKey = "multiGroupActiveRules"
205
+ multiGroupNumFinalRoundRatingsKey = "multiGroupNumFinalRoundRatings"
196
206
# Topic Model
197
207
topicNoteInterceptKey = "topicNoteIntercept"
198
208
topicNoteFactor1Key = "topicNoteFactor1"
@@ -445,6 +455,12 @@ def rater_factor_key(i):
445
455
currentDecidedByKey = "currentDecidedBy"
446
456
currentModelingGroupKey = "currentModelingGroup"
447
457
timestampMillisOfMostRecentStatusChangeKey = "timestampMillisOfMostRecentStatusChange"
458
+ currentMultiGroupStatusKey = "currentMultiGroupStatus"
459
+ currentModelingMultiGroupKey = "currentModelingMultiGroup"
460
+ timestampMillisOfNmrDueToMinStableCrhTimeKey = "timestampMillisOfNmrDueToMinStableCrhTime"
461
+ updatedTimestampMillisOfNmrDueToMinStableCrhTimeKey = (
462
+ "updatedTimestampMillisOfNmrDueToMinStableCrhTime"
463
+ )
448
464
449
465
noteStatusHistoryTSVColumnsAndTypes = [
450
466
(noteIdKey , np .int64 ),
@@ -465,12 +481,22 @@ def rater_factor_key(i):
465
481
(currentDecidedByKey , "category" ),
466
482
(currentModelingGroupKey , np .double ), # TODO: int
467
483
(timestampMillisOfMostRecentStatusChangeKey , np .double ), # double because nullable.
484
+ (timestampMillisOfNmrDueToMinStableCrhTimeKey , np .double ), # double because nullable.
485
+ (currentMultiGroupStatusKey , "category" ),
486
+ (currentModelingMultiGroupKey , np .double ), # TODO: int
468
487
]
469
488
noteStatusHistoryTSVColumns = [col for (col , dtype ) in noteStatusHistoryTSVColumnsAndTypes ]
470
489
noteStatusHistoryTSVTypes = [dtype for (col , dtype ) in noteStatusHistoryTSVColumnsAndTypes ]
471
490
noteStatusHistoryTSVTypeMapping = {
472
491
col : dtype for (col , dtype ) in noteStatusHistoryTSVColumnsAndTypes
473
492
}
493
+ # TODO(jiansongc): remove the *Old variants below (same schema minus the last column) once the new column is in production.
494
+ noteStatusHistoryTSVColumnsOld = noteStatusHistoryTSVColumns [:- 1 ]
495
+ noteStatusHistoryTSVColumnsAndTypesOld = noteStatusHistoryTSVColumnsAndTypes [:- 1 ]
496
+ noteStatusHistoryTSVTypeMappingOld = {
497
+ col : dtype for (col , dtype ) in noteStatusHistoryTSVColumnsAndTypesOld
498
+ }
499
+
474
500
475
501
# Earn In + Earn Out
476
502
enrollmentState = "enrollmentState"
@@ -587,6 +613,8 @@ def rater_factor_key(i):
587
613
{
588
614
coverageNoteInterceptMinKey ,
589
615
coverageNoteInterceptMaxKey ,
616
+ groupNoteInterceptMinKey ,
617
+ groupNoteInterceptMaxKey ,
590
618
}
591
619
)
592
620
@@ -610,58 +638,64 @@ def rater_factor_key(i):
610
638
(noteIdKey , np .int64 ),
611
639
(coreNoteInterceptKey , np .double ),
612
640
(coreNoteFactor1Key , np .double ),
613
- (finalRatingStatusKey , str ),
614
- (firstTagKey , str ),
615
- (secondTagKey , str ),
641
+ (finalRatingStatusKey , "category" ),
642
+ (firstTagKey , "category" ),
643
+ (secondTagKey , "category" ),
616
644
# Note that this column was formerly named "activeRules" and the name is now
617
645
# updated to "coreActiveRules". The data values remain compatible,
618
646
# but the new column only contains rules that ran when deciding status based on
619
647
# the core model.
620
- (coreActiveRulesKey , str ),
621
- (activeFilterTagsKey , str ),
622
- (classificationKey , str ),
648
+ (coreActiveRulesKey , "category" ),
649
+ (activeFilterTagsKey , "category" ),
650
+ (classificationKey , "category" ),
623
651
(createdAtMillisKey , np .int64 ),
624
- (coreRatingStatusKey , str ),
625
- (metaScorerActiveRulesKey , str ),
626
- (decidedByKey , str ),
652
+ (coreRatingStatusKey , "category" ),
653
+ (metaScorerActiveRulesKey , "category" ),
654
+ (decidedByKey , "category" ),
627
655
(expansionNoteInterceptKey , np .double ),
628
656
(expansionNoteFactor1Key , np .double ),
629
- (expansionRatingStatusKey , str ),
657
+ (expansionRatingStatusKey , "category" ),
630
658
(coverageNoteInterceptKey , np .double ),
631
659
(coverageNoteFactor1Key , np .double ),
632
- (coverageRatingStatusKey , str ),
660
+ (coverageRatingStatusKey , "category" ),
633
661
(coreNoteInterceptMinKey , np .double ),
634
662
(coreNoteInterceptMaxKey , np .double ),
635
- (expansionNoteInterceptMinKey , np . double ),
636
- (expansionNoteInterceptMaxKey , np . double ),
637
- (coverageNoteInterceptMinKey , np . double ),
638
- (coverageNoteInterceptMaxKey , np . double ),
663
+ (expansionNoteInterceptMinKey , "category" ), # category because always nan
664
+ (expansionNoteInterceptMaxKey , "category" ), # category because always nan
665
+ (coverageNoteInterceptMinKey , "category" ), # category because always nan
666
+ (coverageNoteInterceptMaxKey , "category" ), # category because always nan
639
667
(groupNoteInterceptKey , np .double ),
640
668
(groupNoteFactor1Key , np .double ),
641
- (groupRatingStatusKey , str ),
642
- (groupNoteInterceptMaxKey , np . double ),
643
- (groupNoteInterceptMinKey , np . double ),
669
+ (groupRatingStatusKey , "category" ),
670
+ (groupNoteInterceptMaxKey , "category" ), # category because always nan
671
+ (groupNoteInterceptMinKey , "category" ), # category because always nan
644
672
(modelingGroupKey , np .float64 ),
645
673
(numRatingsKey , np .int64 ),
646
674
(timestampMillisOfNoteCurrentLabelKey , np .double ),
647
675
(expansionPlusNoteInterceptKey , np .double ),
648
676
(expansionPlusNoteFactor1Key , np .double ),
649
- (expansionPlusRatingStatusKey , str ),
677
+ (expansionPlusRatingStatusKey , "category" ),
650
678
(topicNoteInterceptKey , np .double ),
651
679
(topicNoteFactor1Key , np .double ),
652
- (topicRatingStatusKey , str ),
653
- (noteTopicKey , str ),
680
+ (topicRatingStatusKey , "category" ),
681
+ (noteTopicKey , "category" ),
654
682
(topicNoteConfidentKey , pd .BooleanDtype ()),
655
- (expansionInternalActiveRulesKey , str ),
656
- (expansionPlusInternalActiveRulesKey , str ),
657
- (groupInternalActiveRulesKey , str ),
658
- (topicInternalActiveRulesKey , str ),
683
+ (expansionInternalActiveRulesKey , "category" ),
684
+ (expansionPlusInternalActiveRulesKey , "category" ),
685
+ (groupInternalActiveRulesKey , "category" ),
686
+ (topicInternalActiveRulesKey , "category" ),
659
687
(coreNumFinalRoundRatingsKey , np .double ), # double because nullable.
660
688
(expansionNumFinalRoundRatingsKey , np .double ), # double because nullable.
661
689
(expansionPlusNumFinalRoundRatingsKey , np .double ), # double because nullable.
662
690
(groupNumFinalRoundRatingsKey , np .double ), # double because nullable.
663
691
(topicNumFinalRoundRatingsKey , np .double ), # double because nullable.
664
- (rescoringActiveRulesKey , str ),
692
+ (rescoringActiveRulesKey , "category" ),
693
+ (multiGroupNoteInterceptKey , np .double ),
694
+ (multiGroupNoteFactor1Key , np .double ),
695
+ (multiGroupRatingStatusKey , str ),
696
+ (modelingMultiGroupKey , np .float64 ),
697
+ (multiGroupInternalActiveRulesKey , str ),
698
+ (multiGroupNumFinalRoundRatingsKey , np .double ), # double because nullable.
665
699
]
666
700
noteModelOutputTSVColumns = [col for (col , dtype ) in noteModelOutputTSVColumnsAndTypes ]
667
701
noteModelOutputTSVTypeMapping = {col : dtype for (col , dtype ) in noteModelOutputTSVColumnsAndTypes }
@@ -733,6 +767,9 @@ def rater_factor_key(i):
733
767
(expansionRaterFactor1Key , np .double ),
734
768
(expansionPlusRaterInterceptKey , np .double ),
735
769
(expansionPlusRaterFactor1Key , np .double ),
770
+ (multiGroupRaterInterceptKey , np .double ),
771
+ (multiGroupRaterFactor1Key , np .double ),
772
+ (modelingMultiGroupKey , np .float64 ),
736
773
]
737
774
raterModelOutputTSVColumns = [col for (col , dtype ) in raterModelOutputTSVColumnsAndTypes ]
738
775
raterModelOutputTSVTypeMapping = {col : dtype for (col , dtype ) in raterModelOutputTSVColumnsAndTypes }
@@ -781,6 +818,8 @@ def rater_factor_key(i):
781
818
inputPathsTSVColumns = [col for (col , _ ) in inputPathsTSVColumnsAndTypes ]
782
819
inputPathsTSVTypeMapping = {col : dtype for (col , dtype ) in inputPathsTSVColumnsAndTypes }
783
820
821
+ timestampMinuteOfFinalScoringOutput = "timestampMinuteOfFinalScoringOutput"
822
+
784
823
785
824
@contextmanager
786
825
def time_block (label ):
@@ -888,6 +927,8 @@ class RescoringRuleID(Enum):
888
927
NOTES_FLIPPED_PREVIOUS_RUN = 3
889
928
NEW_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 4
890
929
RECENTLY_FLIPPED_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 5
930
+ NMR_DUE_TO_MIN_STABLE_CRH_TIME = 6
931
+ NOTES_CREATED_SOMEWHAT_RECENTLY = 7
891
932
892
933
893
934
@dataclass
0 commit comments