Skip to content

Commit 9ee120f

Browse files
authored
Merge pull request #176 from twitter/jbaxter/2023_11_29_cleanup
Cleanup after expansion-plus launch
2 parents 6f46290 + 336d3d3 commit 9ee120f

File tree

5 files changed

+14
-68
lines changed

5 files changed

+14
-68
lines changed

sourcecode/scoring/constants.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -410,21 +410,12 @@ def rater_factor_key(i):
410410
(timestampOfLastStateChange, np.int64),
411411
(timestampOfLastEarnOut, np.double), # double because nullable.
412412
(modelingPopulationKey, str),
413+
(modelingGroupKey, np.float64),
413414
]
414415
userEnrollmentTSVColumns = [col for (col, _) in userEnrollmentTSVColumnsAndTypes]
415416
userEnrollmentTSVTypes = [dtype for (_, dtype) in userEnrollmentTSVColumnsAndTypes]
416417
userEnrollmentTSVTypeMapping = {col: dtype for (col, dtype) in userEnrollmentTSVColumnsAndTypes}
417418

418-
# TODO: delete expanded user enrollment definition once modeling group is fully rolled out
419-
userEnrollmentExpandedTSVColumnsAndTypes = userEnrollmentTSVColumnsAndTypes + [
420-
(modelingGroupKey, np.float64)
421-
]
422-
userEnrollmentExpandedTSVColumns = [col for (col, _) in userEnrollmentExpandedTSVColumnsAndTypes]
423-
userEnrollmentExpandedTSVTypes = [dtype for (_, dtype) in userEnrollmentExpandedTSVColumnsAndTypes]
424-
userEnrollmentExpandedTSVTypeMapping = {
425-
col: dtype for (col, dtype) in userEnrollmentExpandedTSVColumnsAndTypes
426-
}
427-
428419
noteInterceptMaxKey = "internalNoteIntercept_max"
429420
noteInterceptMinKey = "internalNoteIntercept_min"
430421
noteParameterUncertaintyTSVMainColumnsAndTypes = [

sourcecode/scoring/mf_core_scorer.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,4 @@ def _filter_input(
173173
noteStatusHistory = noteStatusHistory[noteStatusHistory[c.noteIdKey].isin(coreNotes)]
174174
print(f" Core ratings: {len(ratings)}")
175175

176-
# Guarantee ordering of ratings and noteStatusHistory remains the same relative to the
177-
# original ordering. This code exists to stabilize system test results and can be removed
178-
# once we're confident the rest of the implementation is correct.
179-
ratings = ratings.sort_values([c.noteIdKey, c.raterParticipantIdKey])
180-
noteStatusHistory = noteStatusHistory.sort_values(c.noteIdKey)
181-
182176
return ratings, noteStatusHistory

sourcecode/scoring/mf_expansion_scorer.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -153,20 +153,6 @@ def _filter_input(
153153
)
154154
print(f" Ratings after EXPANSION_PLUS notes filter: {len(ratings)}")
155155

156-
# Guarantee ordering of ratings and noteStatusHistory remains the same relative to the
157-
# original ordering. This code exists to stabilize system test results and can be removed
158-
# once we're confident the rest of the implementation is correct.
159-
ratingOrder = ratingsOrig[[c.noteIdKey, c.raterParticipantIdKey]].reset_index(drop=False)
160-
numRatings = len(ratings)
161-
ratings = ratings.merge(ratingOrder, on=[c.noteIdKey, c.raterParticipantIdKey], how="inner")
162-
assert len(ratings) == numRatings, f"mismatch: {len(ratings)} != {numRatings}"
163-
ratings = ratings.sort_values("index").drop(columns="index")
164-
nshOrder = noteStatusHistoryOrig[[c.noteIdKey]].reset_index(drop=False)
165-
numNotes = len(noteStatusHistory)
166-
noteStatusHistory = noteStatusHistory.merge(nshOrder, on=c.noteIdKey, how="inner")
167-
assert len(noteStatusHistory) == numNotes, f"mismatch: {len(noteStatusHistory)} != {numNotes}"
168-
noteStatusHistory = noteStatusHistory.sort_values("index").drop(columns="index")
169-
170156
return ratings.drop(columns=_EXPANSION_PLUS_BOOL), noteStatusHistory.drop(
171157
columns=_EXPANSION_PLUS_BOOL
172158
)

sourcecode/scoring/process_data.py

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -71,34 +71,9 @@ def tsv_parser(
7171
raise ValueError(f"Invalid input: {e}")
7272

7373

74-
# TODO: remove this function once modelingGroup column is fully launched
75-
def user_enrollment_parser(rawTSV: str, header: bool) -> pd.DataFrame:
76-
"""Parse user enrollment TSV and optinoally tolerate the modelingGroup column.
77-
78-
Args:
79-
rawTSV: str containing entire TSV input
80-
header: bool indicating whether the input will have a header
81-
82-
Returns:
83-
pd.DataFrame containing parsed data
84-
"""
85-
try:
86-
df = tsv_parser(rawTSV, c.userEnrollmentTSVTypeMapping, c.userEnrollmentTSVColumns, header)
87-
df[c.modelingGroupKey] = 0
88-
except ValueError:
89-
df = tsv_parser(
90-
rawTSV, c.userEnrollmentExpandedTSVTypeMapping, c.userEnrollmentExpandedTSVColumns, header
91-
)
92-
return df
93-
94-
95-
# TODO: remove support for specifying a custom parser once modelingGroup is fully rolled out
96-
def tsv_reader(path: str, mapping, columns, header=False, parser=tsv_parser):
74+
def tsv_reader(path: str, mapping, columns, header=False):
9775
with open(path, "r") as handle:
98-
if parser == tsv_parser:
99-
return parser(handle.read(), mapping, columns, header)
100-
else:
101-
return parser(handle.read(), header)
76+
return tsv_parser(handle.read(), mapping, columns, header)
10277

10378

10479
def read_from_tsv(
@@ -159,13 +134,13 @@ def read_from_tsv(
159134
userEnrollment = None
160135
else:
161136
userEnrollment = tsv_reader(
162-
userEnrollmentPath, None, None, header=headers, parser=user_enrollment_parser
137+
userEnrollmentPath, c.userEnrollmentTSVTypeMapping, c.userEnrollmentTSVColumns, header=headers
163138
)
164-
assert len(userEnrollment.columns.values) <= len(c.userEnrollmentExpandedTSVColumns) and (
165-
len(set(userEnrollment.columns) - set(c.userEnrollmentExpandedTSVColumns)) == 0
139+
assert len(userEnrollment.columns.values) == len(c.userEnrollmentTSVColumns) and all(
140+
userEnrollment.columns == c.userEnrollmentTSVColumns
166141
), (
167-
f"userEnrollment columns don't match: \n{[col for col in userEnrollment.columns if not col in c.userEnrollmentExpandedTSVColumns]} are extra columns, "
168-
+ f"\n{[col for col in c.userEnrollmentExpandedTSVColumns if not col in userEnrollment.columns]} are missing."
142+
f"userEnrollment columns don't match: \n{[col for col in userEnrollment.columns if not col in c.userEnrollmentTSVColumns]} are extra columns, "
143+
+ f"\n{[col for col in c.userEnrollmentTSVColumns if not col in userEnrollment.columns]} are missing."
169144
)
170145

171146
return notes, ratings, noteStatusHistory, userEnrollment

sourcecode/scoring/run_scoring.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,15 +50,15 @@ def _get_scorers(
5050

5151
if enabledScorers is None or Scorers.MFCoreScorer in enabledScorers:
5252
scorers[Scorers.MFCoreScorer] = [
53-
MFCoreScorer(seed, pseudoraters, useStableInitialization=useStableInitialization, threads=16)
53+
MFCoreScorer(seed, pseudoraters, useStableInitialization=useStableInitialization, threads=12)
5454
]
5555
if enabledScorers is None or Scorers.MFExpansionScorer in enabledScorers:
5656
scorers[Scorers.MFExpansionScorer] = [
57-
MFExpansionScorer(seed, useStableInitialization=useStableInitialization, threads=16)
57+
MFExpansionScorer(seed, useStableInitialization=useStableInitialization, threads=12)
5858
]
5959
if enabledScorers is None or Scorers.MFExpansionPlusScorer in enabledScorers:
6060
scorers[Scorers.MFExpansionPlusScorer] = [
61-
MFExpansionPlusScorer(seed, useStableInitialization=useStableInitialization, threads=16)
61+
MFExpansionPlusScorer(seed, useStableInitialization=useStableInitialization, threads=12)
6262
]
6363
if enabledScorers is None or Scorers.MFGroupScorer in enabledScorers:
6464
# Note that index 0 is reserved, corresponding to no group assigned, so scoring group
@@ -651,10 +651,10 @@ def run_scoring(
651651
maxReruns,
652652
runParallel=runParallel,
653653
dataLoader=dataLoader,
654-
# Restrict parallelism to 4 processes. Memory usage scales linearly with the number of
655-
# processes and 4 is enough that the limiting factor continues to be the longest running
654+
# Restrict parallelism to 6 processes. Memory usage scales linearly with the number of
655+
# processes and 6 is enough that the limiting factor continues to be the longest running
656656
# scorer (i.e. we would not finish faster with >6 worker processes.)
657-
maxWorkers=4,
657+
maxWorkers=6,
658658
)
659659

660660
postScoringStartTime = time.time()

0 commit comments

Comments (0)