Skip to content

Commit 97e2b08

Browse files
committed
make hash does not need to be part of basic memoizer
and is more reusable when it isn't. This isn't the only way to make a hash, though, and hashing isn't the only way to compare checkpoint entries for equality.
1 parent e1cf1c6 commit 97e2b08

File tree

1 file changed

+37
-36
lines changed

1 file changed

+37
-36
lines changed

parsl/dataflow/memoization.py

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,42 @@ def id_for_memo_function(f: types.FunctionType, output_ref: bool = False) -> byt
121121
return pickle.dumps(["types.FunctionType", f.__name__, f.__module__])
122122

123123

124+
def make_hash(task: TaskRecord) -> str:
    """Compute a memoization hash string for a task.

    Args:
        - task (dict) : Task dictionary from dfk.tasks

    Returns:
        - hash (str) : A unique hash string
    """
    # Work on a shallow copy of kwargs so the task record is not mutated.
    # Two kinds of kwargs are treated specially:
    #   * kwargs named in ignore_for_cache are dropped entirely;
    #   * an 'outputs' kwarg is normalised separately with output_ref=True.
    remaining_kwargs = dict(task['kwargs'])

    to_ignore = task['ignore_for_cache']
    logger.debug("Ignoring these kwargs for checkpointing: %s", to_ignore)
    for name in to_ignore:
        logger.debug("Ignoring kwarg %s", name)
        remaining_kwargs.pop(name)  # KeyError here mirrors the original `del`

    pieces: List[bytes] = []

    if 'outputs' in task['kwargs']:
        # Normalise outputs by reference rather than by value.
        pieces.append(id_for_memo(remaining_kwargs.pop('outputs'), output_ref=True))

    pieces += [id_for_memo(component)
               for component in (remaining_kwargs, task['func'], task['args'])]

    return hashlib.md5(b''.join(pieces)).hexdigest()
158+
159+
124160
class Memoizer:
125161
def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files: Sequence[str], run_dir: str) -> None:
126162
raise NotImplementedError
@@ -200,41 +236,6 @@ def start(self, *, dfk: DataFlowKernel, memoize: bool = True, checkpoint_files:
200236
logger.info("App caching disabled for all apps")
201237
self.memo_lookup_table = {}
202238

203-
def make_hash(self, task: TaskRecord) -> str:
204-
"""Create a hash of the task inputs.
205-
206-
Args:
207-
- task (dict) : Task dictionary from dfk.tasks
208-
209-
Returns:
210-
- hash (str) : A unique hash string
211-
"""
212-
213-
t: List[bytes] = []
214-
215-
# if kwargs contains an outputs parameter, that parameter is removed
216-
# and normalised differently - with output_ref set to True.
217-
# kwargs listed in ignore_for_cache will also be removed
218-
219-
filtered_kw = task['kwargs'].copy()
220-
221-
ignore_list = task['ignore_for_cache']
222-
223-
logger.debug("Ignoring these kwargs for checkpointing: %s", ignore_list)
224-
for k in ignore_list:
225-
logger.debug("Ignoring kwarg %s", k)
226-
del filtered_kw[k]
227-
228-
if 'outputs' in task['kwargs']:
229-
outputs = task['kwargs']['outputs']
230-
del filtered_kw['outputs']
231-
t.append(id_for_memo(outputs, output_ref=True))
232-
233-
t.extend(map(id_for_memo, (filtered_kw, task['func'], task['args'])))
234-
235-
x = b''.join(t)
236-
return hashlib.md5(x).hexdigest()
237-
238239
def check_memo(self, task: TaskRecord) -> Optional[Future[Any]]:
239240
"""Create a hash of the task and its inputs and check the lookup table for this hash.
240241
@@ -256,7 +257,7 @@ def check_memo(self, task: TaskRecord) -> Optional[Future[Any]]:
256257
logger.debug("Task {} will not be memoized".format(task_id))
257258
return None
258259

259-
hashsum = self.make_hash(task)
260+
hashsum = make_hash(task)
260261
logger.debug("Task {} has memoization hash {}".format(task_id, hashsum))
261262
result = None
262263
if hashsum in self.memo_lookup_table:

0 commit comments

Comments
 (0)