Skip to content

Commit adbc126

Browse files
authored
Merge pull request #220 from twitter/jbaxter/2024_04_26
Freeze rater parameters in final scoring, turn on status locking, parquet output + more column output
2 parents 998fa4b + e02a7ec commit adbc126

12 files changed

+102
-24
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ pandas==2.1.4
33
torch==2.1.2
44
scipy==1.11.4
55
scikit-learn>=1.3.0
6+
pyarrow

sourcecode/scoring/constants.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,12 @@ def rater_factor_key(i):
134134
expansionRatingStatusKey = "expansionRatingStatus"
135135
expansionNoteInterceptMaxKey = "expansionNoteInterceptMax"
136136
expansionNoteInterceptMinKey = "expansionNoteInterceptMin"
137+
expansionInternalActiveRulesKey = "expansionActiveRules"
137138
# ExpansionPlus Model
138139
expansionPlusNoteInterceptKey = "expansionPlusNoteIntercept"
139140
expansionPlusNoteFactor1Key = "expansionPlusNoteFactor1"
140141
expansionPlusRatingStatusKey = "expansionPlusRatingStatus"
142+
expansionPlusInternalActiveRulesKey = "expansionPlusActiveRules"
141143
# Coverage / Helpfulness Reputation Model
142144
coverageNoteInterceptKey = "coverageNoteIntercept"
143145
coverageNoteFactor1Key = "coverageNoteFactor1"
@@ -153,11 +155,13 @@ def rater_factor_key(i):
153155
groupNoteInterceptMinKey = "groupNoteInterceptMin"
154156
groupRaterInterceptKey = "groupRaterIntercept"
155157
groupRaterFactor1Key = "groupRaterFactor1"
158+
groupInternalActiveRulesKey = "groupActiveRules"
156159
# Topic Model
157160
topicNoteInterceptKey = "topicNoteIntercept"
158161
topicNoteFactor1Key = "topicNoteFactor1"
159162
topicRatingStatusKey = "topicRatingStatus"
160163
topicNoteConfidentKey = "topicNoteConfident"
164+
topicInternalActiveRulesKey = "topicActiveRules"
161165
# Harassment/Abuse Tag
162166
harassmentNoteInterceptKey = "harassmentNoteIntercept"
163167
harassmentNoteFactor1Key = "harassmentNoteFactor1"
@@ -558,6 +562,10 @@ def rater_factor_key(i):
558562
(topicRatingStatusKey, str),
559563
(noteTopicKey, str),
560564
(topicNoteConfidentKey, str),
565+
(expansionInternalActiveRulesKey, str),
566+
(expansionPlusInternalActiveRulesKey, str),
567+
(groupInternalActiveRulesKey, str),
568+
(topicInternalActiveRulesKey, str),
561569
]
562570
noteModelOutputTSVColumns = [col for (col, dtype) in noteModelOutputTSVColumnsAndTypes]
563571
noteModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in noteModelOutputTSVColumnsAndTypes}

sourcecode/scoring/matrix_factorization/matrix_factorization.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ def run_mf(
443443
globalInterceptInit: Optional[float] = None,
444444
specificNoteId: Optional[int] = None,
445445
validatePercent: Optional[float] = None,
446+
freezeRaterParameters: bool = False,
446447
):
447448
"""Train matrix factorization model.
448449
@@ -466,6 +467,8 @@ def run_mf(
466467
self._create_mf_model(noteInit, userInit, globalInterceptInit)
467468
assert self.mf_model is not None
468469

470+
if freezeRaterParameters:
471+
self.mf_model._freeze_parameters(set({"user"}))
469472
if specificNoteId is not None:
470473
self.mf_model.freeze_rater_and_global_parameters()
471474
self.prepare_features_and_labels(specificNoteId)

sourcecode/scoring/mf_base_scorer.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,18 @@ def _prescore_notes_and_users(
570570
if self._saveIntermediateState:
571571
self.helpfulnessScores = helpfulnessScores
572572

573+
## One extra final round!
574+
# Filter ratings based on the previously computed helpfulness scores
575+
finalRoundRatings = helpfulness_scores.filter_ratings_by_helpfulness_scores(
576+
ratingsForTraining, helpfulnessScores
577+
)
578+
# Run MF
579+
noteParamsUnfiltered, raterParamsUnfiltered, globalBias = self._mfRanker.run_mf(
580+
ratings=finalRoundRatings,
581+
noteInit=noteParamsUnfiltered,
582+
userInit=raterParamsUnfiltered,
583+
)
584+
573585
raterModelOutput = raterParamsUnfiltered.merge(
574586
helpfulnessScores[
575587
[
@@ -644,6 +656,8 @@ def _score_notes_and_users(
644656
ratings=finalRoundRatings,
645657
noteInit=prescoringNoteModelOutput,
646658
userInit=prescoringRaterModelOutput,
659+
globalInterceptInit=0.17,
660+
freezeRaterParameters=True,
647661
)
648662

649663
if self._saveIntermediateState:

sourcecode/scoring/mf_expansion_plus_scorer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
3737
c.internalNoteInterceptKey: c.expansionPlusNoteInterceptKey,
3838
c.internalNoteFactor1Key: c.expansionPlusNoteFactor1Key,
3939
c.internalRatingStatusKey: c.expansionPlusRatingStatusKey,
40+
c.internalActiveRulesKey: c.expansionPlusInternalActiveRulesKey,
4041
}
4142

4243
def get_scored_notes_cols(self) -> List[str]:
@@ -46,6 +47,7 @@ def get_scored_notes_cols(self) -> List[str]:
4647
c.expansionPlusNoteInterceptKey,
4748
c.expansionPlusNoteFactor1Key,
4849
c.expansionPlusRatingStatusKey,
50+
c.expansionPlusInternalActiveRulesKey,
4951
]
5052

5153
def get_helpfulness_scores_cols(self) -> List[str]:
@@ -60,7 +62,6 @@ def _get_dropped_note_cols(self) -> List[str]:
6062
"""Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
6163
return super()._get_dropped_note_cols() + (
6264
[
63-
c.internalActiveRulesKey,
6465
c.activeFilterTagsKey,
6566
c.ratingWeightKey,
6667
c.noteInterceptMinKey,

sourcecode/scoring/mf_expansion_scorer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
4343
c.internalRatingStatusKey: c.expansionRatingStatusKey,
4444
c.noteInterceptMinKey: c.expansionNoteInterceptMinKey,
4545
c.noteInterceptMaxKey: c.expansionNoteInterceptMaxKey,
46+
c.internalActiveRulesKey: c.expansionInternalActiveRulesKey,
4647
}
4748

4849
def get_scored_notes_cols(self) -> List[str]:
@@ -54,6 +55,7 @@ def get_scored_notes_cols(self) -> List[str]:
5455
c.expansionRatingStatusKey,
5556
c.expansionNoteInterceptMinKey,
5657
c.expansionNoteInterceptMaxKey,
58+
c.expansionInternalActiveRulesKey,
5759
]
5860

5961
def get_helpfulness_scores_cols(self) -> List[str]:
@@ -68,7 +70,6 @@ def _get_dropped_note_cols(self) -> List[str]:
6870
"""Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
6971
return super()._get_dropped_note_cols() + (
7072
[
71-
c.internalActiveRulesKey,
7273
c.activeFilterTagsKey,
7374
c.ratingWeightKey,
7475
]

sourcecode/scoring/mf_group_scorer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def coalesce_group_models(
4444
c.groupNoteInterceptMaxKey,
4545
c.groupNoteInterceptMinKey,
4646
c.modelingGroupKey,
47+
c.groupInternalActiveRulesKey,
4748
]:
4849
scoredNotes = coalesce_columns(scoredNotes, col)
4950

@@ -135,6 +136,7 @@ def __init__(
135136
self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupNumber}"
136137
self._groupNoteInterceptMaxKey = f"{c.groupNoteInterceptMaxKey}_{self._groupNumber}"
137138
self._groupNoteInterceptMinKey = f"{c.groupNoteInterceptMinKey}_{self._groupNumber}"
139+
self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupNumber}"
138140
self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupNumber}"
139141
self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupNumber}"
140142
self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupNumber}"
@@ -151,6 +153,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
151153
c.internalRatingStatusKey: self._groupRatingStatusKey,
152154
c.noteInterceptMinKey: self._groupNoteInterceptMinKey,
153155
c.noteInterceptMaxKey: self._groupNoteInterceptMaxKey,
156+
c.internalActiveRulesKey: self._groupInternalActiveRulesKey,
154157
}
155158

156159
def _get_user_col_mapping(self) -> Dict[str, str]:
@@ -169,6 +172,7 @@ def get_scored_notes_cols(self) -> List[str]:
169172
self._groupRatingStatusKey,
170173
self._groupNoteInterceptMaxKey,
171174
self._groupNoteInterceptMinKey,
175+
self._groupInternalActiveRulesKey,
172176
self._modelingGroupKey,
173177
]
174178

@@ -189,7 +193,6 @@ def _get_dropped_note_cols(self) -> List[str]:
189193
"""Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
190194
return super()._get_dropped_note_cols() + (
191195
[
192-
c.internalActiveRulesKey,
193196
c.activeFilterTagsKey,
194197
c.ratingWeightKey,
195198
]

sourcecode/scoring/mf_topic_scorer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def coalesce_topic_models(scoredNotes: pd.DataFrame) -> pd.DataFrame:
2626
c.topicRatingStatusKey,
2727
c.topicNoteConfidentKey,
2828
c.noteTopicKey,
29+
c.topicInternalActiveRulesKey,
2930
]:
3031
scoredNotes = coalesce_columns(scoredNotes, col)
3132

@@ -106,6 +107,7 @@ def __init__(
106107
self._topicNoteInterceptKey = f"{c.topicNoteInterceptKey}_{self._topicName}"
107108
self._topicNoteFactor1Key = f"{c.topicNoteFactor1Key}_{self._topicName}"
108109
self._topicRatingStatusKey = f"{c.topicRatingStatusKey}_{self._topicName}"
110+
self._topicInternalActiveRulesKey = f"{c.topicInternalActiveRulesKey}_{self._topicName}"
109111
self._noteTopicKey = f"{c.noteTopicKey}_{self._topicName}"
110112
self._noteTopicConfidentKey = f"{c.topicNoteConfidentKey}_{self._topicName}"
111113

@@ -118,6 +120,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
118120
c.internalNoteInterceptKey: self._topicNoteInterceptKey,
119121
c.internalNoteFactor1Key: self._topicNoteFactor1Key,
120122
c.internalRatingStatusKey: self._topicRatingStatusKey,
123+
c.internalActiveRulesKey: self._topicInternalActiveRulesKey,
121124
}
122125

123126
def get_scored_notes_cols(self) -> List[str]:
@@ -129,6 +132,7 @@ def get_scored_notes_cols(self) -> List[str]:
129132
self._topicRatingStatusKey,
130133
self._noteTopicKey,
131134
self._noteTopicConfidentKey,
135+
self._topicInternalActiveRulesKey,
132136
]
133137

134138
def get_helpfulness_scores_cols(self) -> List[str]:
@@ -143,7 +147,6 @@ def _get_dropped_note_cols(self) -> List[str]:
143147
"""Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
144148
return super()._get_dropped_note_cols() + (
145149
[
146-
c.internalActiveRulesKey,
147150
c.activeFilterTagsKey,
148151
c.ratingWeightKey,
149152
c.noteInterceptMinKey,

sourcecode/scoring/process_data.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -421,15 +421,29 @@ def write_tsv_local(df: pd.DataFrame, path: str) -> None:
421421
Args:
422422
df: pd.DataFrame to write to disk.
423423
path: location of file on disk.
424-
425-
Returns:
426-
None, because path is always None.
427424
"""
428425

429426
assert path is not None
430427
assert df.to_csv(path, index=False, header=True, sep="\t") is None
431428

432429

430+
def write_parquet_local(
431+
df: pd.DataFrame, path: str, compression: str = "snappy", engine: str = "pyarrow"
432+
) -> None:
433+
"""Write DF as a parquet file stored to local disk. Compress with snappy
434+
and use the pyarrow engine by default; both can be overridden via the compression and engine arguments.
435+
436+
Args:
437+
df: pd.DataFrame to write to disk.
438+
path: location of file on disk.
439+
compression: compression algorithm to use. Defaults to 'snappy'.
440+
engine: engine to use. Defaults to 'pyarrow'.
441+
"""
442+
443+
assert path is not None
444+
df.to_parquet(path, compression=compression, engine=engine)
445+
446+
433447
class CommunityNotesDataLoader(ABC):
434448
"""Base class which local and prod data loaders extend.
435449

sourcecode/scoring/run_scoring.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,9 @@ def _run_scorer_parallelizable(
254254
scoringArgs = _load_data_from_shared_memory_parallelizable(
255255
scoringArgsSharedMemory, scoringArgs
256256
)
257-
print(f"{scorer.get_name()} run_scorer_parallelizable just finished loading data from shared memory.")
257+
print(
258+
f"{scorer.get_name()} run_scorer_parallelizable just finished loading data from shared memory."
259+
)
258260
elif dataLoader is not None:
259261
print(
260262
f"{scorer.get_name()} run_scorer_parallelizable just started in parallel: loading data with dataLoader."
@@ -522,19 +524,25 @@ def meta_score(
522524
# MFExpansionPlusScorer will have the lowest priority.
523525
rules.append(
524526
scoring_rules.ApplyModelResult(
525-
RuleID.EXPANSION_PLUS_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionPlusRatingStatusKey
527+
RuleID.EXPANSION_PLUS_MODEL,
528+
{RuleID.META_INITIAL_NMR},
529+
c.expansionPlusRatingStatusKey,
526530
)
527531
)
528532
if enabledScorers is None or Scorers.MFExpansionScorer in enabledScorers:
529533
rules.append(
530534
scoring_rules.ApplyModelResult(
531-
RuleID.EXPANSION_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionRatingStatusKey
535+
RuleID.EXPANSION_MODEL,
536+
{RuleID.META_INITIAL_NMR},
537+
c.expansionRatingStatusKey,
532538
)
533539
)
534540
if enabledScorers is None or Scorers.MFCoreScorer in enabledScorers:
535541
rules.append(
536542
scoring_rules.ApplyModelResult(
537-
RuleID.CORE_MODEL, {RuleID.META_INITIAL_NMR}, c.coreRatingStatusKey
543+
RuleID.CORE_MODEL,
544+
{RuleID.META_INITIAL_NMR},
545+
c.coreRatingStatusKey,
538546
)
539547
)
540548
if enabledScorers is None or Scorers.MFGroupScorer in enabledScorers:

sourcecode/scoring/runner.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,12 @@
33

44
from . import constants as c
55
from .enums import scorers_from_csv
6-
from .process_data import LocalDataLoader, write_prescoring_output, write_tsv_local
6+
from .process_data import (
7+
LocalDataLoader,
8+
write_parquet_local,
9+
write_prescoring_output,
10+
write_tsv_local,
11+
)
712
from .run_scoring import run_scoring
813

914

@@ -84,6 +89,13 @@ def parse_args():
8489
dest="prescoring_delay_hours",
8590
help="Filter prescoring input to simulate delay in hours",
8691
)
92+
parser.add_argument(
93+
"--no-parquet",
94+
help="Disable writing parquet files.",
95+
default=False,
96+
action="store_true",
97+
dest="no_parquet",
98+
)
8799

88100
return parser.parse_args()
89101

@@ -138,6 +150,12 @@ def prescoring_write_fn(notePath, raterPath):
138150
write_tsv_local(newStatus, os.path.join(args.outdir, "note_status_history.tsv"))
139151
write_tsv_local(auxNoteInfo, os.path.join(args.outdir, "aux_note_info.tsv"))
140152

153+
if not args.no_parquet:
154+
write_parquet_local(scoredNotes, os.path.join(args.outdir, "scored_notes.parquet"))
155+
write_parquet_local(helpfulnessScores, os.path.join(args.outdir, "helpfulness_scores.parquet"))
156+
write_parquet_local(newStatus, os.path.join(args.outdir, "note_status_history.parquet"))
157+
write_parquet_local(auxNoteInfo, os.path.join(args.outdir, "aux_note_info.parquet"))
158+
141159

142160
if __name__ == "__main__":
143161
main()

0 commit comments

Comments
 (0)