Skip to content

Commit 68df19a

Browse files
committed
Optimize scorer: use shared memory across processes and optimize pseudoraters
Final scoring and prescoring each now run about 10 minutes faster when run in parallel on one large CPU machine, due to sharing large dataframes in memory across multiple processes instead of re-reading them from disk. The core scorer itself also runs about 10 minutes faster due to cleanup of unused pseudorater computations (uncertainty estimation).
1 parent b539096 commit 68df19a

File tree

4 files changed

+204
-26
lines changed

4 files changed

+204
-26
lines changed

sourcecode/scoring/constants.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,34 @@ def time_block(label):
633633
print(f"{label} elapsed time: {end - start:.2f} secs ({((end-start)/60.0):.2f} mins)")
634634

635635

636+
@dataclass
637+
class SharedMemoryDataframeInfo:
638+
sharedMemoryName: str
639+
columns: list
640+
dataShape: tuple
641+
dtypesDict: dict
642+
npDtype: str
643+
644+
645+
@dataclass
646+
class ScoringArgsSharedMemory:
647+
noteTopics: SharedMemoryDataframeInfo
648+
ratings: SharedMemoryDataframeInfo
649+
noteStatusHistory: SharedMemoryDataframeInfo
650+
userEnrollment: SharedMemoryDataframeInfo
651+
652+
653+
@dataclass
654+
class PrescoringArgsSharedMemory(ScoringArgsSharedMemory):
655+
pass
656+
657+
658+
@dataclass
659+
class FinalScoringArgsSharedMemory(ScoringArgsSharedMemory):
660+
prescoringNoteModelOutput: SharedMemoryDataframeInfo
661+
prescoringRaterModelOutput: SharedMemoryDataframeInfo
662+
663+
636664
@dataclass
637665
class ScoringArgs:
638666
noteTopics: pd.DataFrame
@@ -641,6 +669,7 @@ class ScoringArgs:
641669
userEnrollment: pd.DataFrame
642670

643671
def remove_large_args_for_multiprocessing(self):
672+
self.noteTopics = None
644673
self.ratings = None
645674
self.noteStatusHistory = None
646675
self.userEnrollment = None

sourcecode/scoring/helpfulness_scores.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def filter_ratings_by_helpfulness_scores(
190190
filtered_ratings pandas.DataFrame: same schema as input ratings, but filtered.
191191
"""
192192
includedUsers = helpfulnessScores.loc[
193-
helpfulnessScores[c.aboveHelpfulnessThresholdKey], [c.raterParticipantIdKey]
193+
helpfulnessScores[c.aboveHelpfulnessThresholdKey].fillna(False), [c.raterParticipantIdKey]
194194
]
195195
ratingsHelpfulnessScoreFiltered = includedUsers.merge(
196196
ratingsForTraining, on=c.raterParticipantIdKey

sourcecode/scoring/matrix_factorization/pseudo_raters.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -85,15 +85,13 @@ def _check_note_parameters_same(self, newMatrixFactorization: MatrixFactorizatio
8585
assert (noteParamsFromNewModel == self.noteParams).all().all()
8686

8787
def _make_extreme_raters(self, raterParams: pd.DataFrame, raterIdMap: pd.DataFrame):
88-
"""Populates self.extremeRaters, which is a list of dicts with rater id info
88+
"""Populates self.extremeRaters, which is a list of dicts with rater id info"""
8989

90-
Args:
91-
raterParams (_type_): _description_
92-
raterIdMap (_type_): _description_
93-
"""
90+
# Because LCB is turned off and we therefore don't use not-helpful pseudoratings anymore,
91+
# we only include the min rater intercept (which produces the highest possible note intercept)
92+
# If we were to use not-helpful pseudoratings, we would also include the max rater intercept.
9493
raterInterceptValues = [
9594
raterParams[c.internalRaterInterceptKey].min(),
96-
raterParams[c.internalRaterInterceptKey].max(),
9795
]
9896
raterFactorValues = [
9997
raterParams[c.internalRaterFactor1Key].min(),
@@ -211,7 +209,8 @@ def _create_extreme_ratings(self):
211209
for extremeRater in self.extremeRaters:
212210
extremeRater[c.raterParticipantIdKey] = str(extremeRater[c.raterParticipantIdKey])
213211

214-
for helpfulNum in (0.0, 1.0):
212+
# Since LCB is turned off, don't waste compute on not-helpful pseudoratings.
213+
for helpfulNum in [1.0]: # Only helpful ratings
215214
extremeRater[c.helpfulNumKey] = helpfulNum
216215
self.extremeRatingsToAddWithoutNotes.append(extremeRater.copy())
217216

sourcecode/scoring/run_scoring.py

Lines changed: 168 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
integrated into main files for execution in internal and external environments.
66
"""
77
import concurrent.futures
8+
import copy
89
from itertools import chain
910
import multiprocessing
11+
from multiprocessing import shared_memory # type: ignore
1012
import time
1113
from typing import Callable, Dict, List, Optional, Set, Tuple
1214

@@ -169,29 +171,101 @@ def _merge_results(
169171
return scoredNotes, helpfulnessScores, auxiliaryNoteInfo
170172

171173

174+
def _load_data_with_data_loader_parallelizable(
175+
dataLoader: CommunityNotesDataLoader, scoringArgs: ScoringArgs
176+
) -> ScoringArgs:
177+
"""
178+
Load data from the dataLoader into the scoringArgs object. This function is designed to be run
179+
in a multiprocessing pool,
180+
181+
Deprecated: prefer _load_data_from_shared_memory_parallelizable.
182+
"""
183+
_, ratings, noteStatusHistory, userEnrollment = dataLoader.get_data()
184+
185+
scoringArgs.ratings = ratings
186+
scoringArgs.noteStatusHistory = noteStatusHistory
187+
scoringArgs.userEnrollment = userEnrollment
188+
if type(scoringArgs) == FinalScoringArgs:
189+
prescoringNoteModelOutput, prescoringRaterParams = dataLoader.get_prescoring_model_output()
190+
scoringArgs.prescoringNoteModelOutput = prescoringNoteModelOutput
191+
scoringArgs.prescoringRaterModelOutput = prescoringRaterParams
192+
return scoringArgs
193+
194+
195+
def _load_data_from_shared_memory_parallelizable(
196+
scoringArgsSharedMemory: c.ScoringArgsSharedMemory, scoringArgs: ScoringArgs
197+
) -> ScoringArgs:
198+
"""
199+
Load data from shared memory into the scoringArgs object. This function is designed to be run
200+
in a multiprocessing pool.
201+
"""
202+
scoringArgs.noteTopics = get_df_from_shared_memory(scoringArgsSharedMemory.noteTopics)
203+
scoringArgs.ratings = get_df_from_shared_memory(scoringArgsSharedMemory.ratings)
204+
scoringArgs.noteStatusHistory = get_df_from_shared_memory(
205+
scoringArgsSharedMemory.noteStatusHistory
206+
)
207+
scoringArgs.userEnrollment = get_df_from_shared_memory(scoringArgsSharedMemory.userEnrollment)
208+
209+
if type(scoringArgs) == FinalScoringArgs:
210+
assert type(scoringArgsSharedMemory) == c.FinalScoringArgsSharedMemory
211+
scoringArgs.prescoringNoteModelOutput = get_df_from_shared_memory(
212+
scoringArgsSharedMemory.prescoringNoteModelOutput
213+
)
214+
scoringArgs.prescoringRaterModelOutput = get_df_from_shared_memory(
215+
scoringArgsSharedMemory.prescoringRaterModelOutput
216+
)
217+
return scoringArgs
218+
219+
172220
def _run_scorer_parallelizable(
173221
scorer: Scorer,
174222
runParallel: bool,
175223
scoringArgs: ScoringArgs,
176224
dataLoader: Optional[CommunityNotesDataLoader] = None,
225+
scoringArgsSharedMemory=None,
177226
) -> Tuple[ModelResult, float]:
227+
"""
228+
Run scoring (either prescoring or final scoring) for a single scorer.
229+
This function is designed to be run in a multiprocessing pool, so you can run this function
230+
for each scorer in parallel.
231+
232+
We determine whether to run prescoring or final scoring based on the type of scoringArgs
233+
(PrescoringArgs or FinalScoringArgs).
234+
235+
If runParallel is False, then we read input dataframes from scoringArgs.
236+
237+
If runParallel is True, then we ignore the dataframe attributes of scoringArgs, and read
238+
the input dataframes from shared memory if scoringArgsSharedMemory is not None (preferred),
239+
or from the dataLoader if scoringArgsSharedMemory is None. However, using the dataLoader to
240+
re-read the dataframes from disk is much slower than using shared memory and is deprecated.
241+
"""
242+
scorerStartTime = time.perf_counter()
243+
244+
# Load data if multiprocessing
178245
if runParallel:
179-
assert dataLoader is not None, "must provide a dataLoader to run parallel"
180-
print(f"Since parallel, loading data in run_scoring process for {scorer.get_name()}")
181-
## TODO: also load prescoringNoteModelOutput, raterParamsUnfiltered from data loader.
182-
_, ratings, noteStatusHistory, userEnrollment = dataLoader.get_data()
183-
184-
scoringArgs.ratings = ratings
185-
scoringArgs.noteStatusHistory = noteStatusHistory
186-
scoringArgs.userEnrollment = userEnrollment
187-
if type(scoringArgs) == FinalScoringArgs:
188-
print(
189-
f"Loading prescoring model output for final scoring, in parallel for scorer {scorer.get_name()}."
190-
)
191-
prescoringNoteModelOutput, prescoringRaterParams = dataLoader.get_prescoring_model_output()
192-
scoringArgs.prescoringNoteModelOutput = prescoringNoteModelOutput
193-
scoringArgs.prescoringRaterModelOutput = prescoringRaterParams
246+
with c.time_block(f"{scorer.get_name()} run_scorer_parallelizable: Loading data"):
247+
scoringArgs.remove_large_args_for_multiprocessing() # Should be redundant
248+
scoringArgs = copy.deepcopy(scoringArgs)
249+
250+
if scoringArgsSharedMemory is not None:
251+
print(
252+
f"{scorer.get_name()} run_scorer_parallelizable just started in parallel: loading data from shared memory."
253+
)
254+
scoringArgs = _load_data_from_shared_memory_parallelizable(
255+
scoringArgsSharedMemory, scoringArgs
256+
)
257+
print(f"{scorer.get_name()} run_scorer_parallelizable just finished loading data from shared memory.")
258+
elif dataLoader is not None:
259+
print(
260+
f"{scorer.get_name()} run_scorer_parallelizable just started in parallel: loading data with dataLoader."
261+
)
262+
scoringArgs = _load_data_with_data_loader_parallelizable(dataLoader, scoringArgs)
263+
else:
264+
raise ValueError(
265+
"Must provide either scoringArgsSharedMemory or dataLoader to run parallel"
266+
)
194267

268+
# Run scoring
195269
scorerStartTime = time.perf_counter()
196270
if type(scoringArgs) == PrescoringArgs:
197271
scoringResults = scorer.prescore(scoringArgs)
@@ -204,6 +278,75 @@ def _run_scorer_parallelizable(
204278
return scoringResults, (scorerEndTime - scorerStartTime)
205279

206280

281+
def save_df_to_shared_memory(df: pd.DataFrame, shms: List) -> c.SharedMemoryDataframeInfo:
282+
"""
283+
Intended to be called before beginning multiprocessing: saves the df to shared memory
284+
and returns the info needed to access it, as well as appends it to the list of shared memory objects
285+
so it's not garbage collected and can be closed later.
286+
"""
287+
cols = df.columns
288+
data = df.to_numpy()
289+
df_dtypes_dict = dict(list(zip(df.columns, df.dtypes)))
290+
shm = shared_memory.SharedMemory(create=True, size=data.nbytes)
291+
np_array = np.ndarray(data.shape, dtype=data.dtype, buffer=shm.buf)
292+
np_array[:] = data[:]
293+
shms.append(shm) # save the shared memory object so we can close it later
294+
return c.SharedMemoryDataframeInfo(
295+
sharedMemoryName=shm.name,
296+
columns=cols,
297+
dataShape=data.shape,
298+
dtypesDict=df_dtypes_dict,
299+
npDtype=np_array.dtype,
300+
)
301+
302+
303+
def get_df_from_shared_memory(sharedMemoryDfInfo: c.SharedMemoryDataframeInfo) -> pd.DataFrame:
304+
"""
305+
Intended to be called from a process within a multiprocessing pool in parallel.
306+
Read a dataframe from shared memory and return it.
307+
"""
308+
existing_shm = shared_memory.SharedMemory(name=sharedMemoryDfInfo.sharedMemoryName)
309+
np_array = np.ndarray(
310+
sharedMemoryDfInfo.dataShape, buffer=existing_shm.buf, dtype=sharedMemoryDfInfo.npDtype
311+
)
312+
df = pd.DataFrame(np_array, columns=sharedMemoryDfInfo.columns)
313+
df = df.astype(sharedMemoryDfInfo.dtypesDict)
314+
return df
315+
316+
317+
def _save_dfs_to_shared_memory(
318+
scoringArgs: ScoringArgs,
319+
) -> Tuple[List[shared_memory.SharedMemory], c.ScoringArgsSharedMemory]:
320+
"""
321+
Save large dfs to shared memory. Called before beginning multiprocessing.
322+
"""
323+
shms: List[shared_memory.SharedMemory] = []
324+
noteTopics = save_df_to_shared_memory(scoringArgs.noteTopics, shms)
325+
ratings = save_df_to_shared_memory(scoringArgs.ratings, shms)
326+
noteStatusHistory = save_df_to_shared_memory(scoringArgs.noteStatusHistory, shms)
327+
userEnrollment = save_df_to_shared_memory(scoringArgs.userEnrollment, shms)
328+
329+
if type(scoringArgs) == FinalScoringArgs:
330+
prescoringNoteModelOutput = save_df_to_shared_memory(
331+
scoringArgs.prescoringNoteModelOutput, shms
332+
)
333+
prescoringRaterModelOutput = save_df_to_shared_memory(
334+
scoringArgs.prescoringRaterModelOutput, shms
335+
)
336+
return shms, c.FinalScoringArgsSharedMemory(
337+
noteTopics,
338+
ratings,
339+
noteStatusHistory,
340+
userEnrollment,
341+
prescoringNoteModelOutput,
342+
prescoringRaterModelOutput,
343+
)
344+
else:
345+
return shms, c.PrescoringArgsSharedMemory(
346+
noteTopics, ratings, noteStatusHistory, userEnrollment
347+
)
348+
349+
207350
def _run_scorers(
208351
scorers: List[Scorer],
209352
scoringArgs: ScoringArgs,
@@ -231,10 +374,12 @@ def _run_scorers(
231374
# Apply scoring algorithms
232375
overallStartTime = time.perf_counter()
233376
if runParallel:
377+
shms, scoringArgsSharedMemory = _save_dfs_to_shared_memory(scoringArgs)
378+
234379
with concurrent.futures.ProcessPoolExecutor(
235-
mp_context=multiprocessing.get_context("spawn"), max_workers=maxWorkers
380+
mp_context=multiprocessing.get_context("fork"),
381+
max_workers=maxWorkers,
236382
) as executor:
237-
assert dataLoader is not None
238383
print(f"Starting parallel scorer execution with {len(scorers)} scorers.")
239384
# Pass mostly-empty scoringArgs: the data is too large to be copied in-memory to
240385
# each process, so must be re-loaded from disk by every scorer's dataLoader.
@@ -244,12 +389,17 @@ def _run_scorers(
244389
_run_scorer_parallelizable,
245390
scorer=scorer,
246391
runParallel=True,
392+
scoringArgs=copy.deepcopy(scoringArgs),
247393
dataLoader=dataLoader,
248-
scoringArgs=scoringArgs,
394+
scoringArgsSharedMemory=copy.deepcopy(scoringArgsSharedMemory),
249395
)
250396
for scorer in scorers
251397
]
252398
modelResultsAndTimes = [f.result() for f in futures]
399+
400+
for shm in shms:
401+
shm.close()
402+
shm.unlink() # free the shared memory
253403
else:
254404
modelResultsAndTimes = [
255405
_run_scorer_parallelizable(

0 commit comments

Comments
 (0)