
Commit ba3d305

Author: Bhargav Dodla

fix: Improve logging for spark materialization engine

1 parent 402a383 · commit ba3d305

1 file changed: +96 −7 lines changed

sdk/python/feast/infra/materialization/contrib/spark/spark_materialization_engine.py

Lines changed: 96 additions & 7 deletions
@@ -234,15 +234,25 @@ def _map_by_partition(
     spark_serialized_artifacts: _SparkSerializedArtifacts,
 ):
     feature_view, online_store, repo_config = spark_serialized_artifacts.unserialize()
+
+    total_batches = 0
+    total_time = 0.0
+    min_time = float("inf")
+    max_time = float("-inf")
+
+    total_rows = 0
+    min_batch_size = float("inf")
+    max_batch_size = float("-inf")
+
     """Load pandas df to online store"""
     for pdf in iterator:
+        start_time = time.perf_counter()
         pdf_row_count = pdf.shape[0]
-        start_time = time.time()
-        # convert to pyarrow table
         if pdf_row_count == 0:
-            print("INFO!!! Dataframe has 0 records to process")
-            return
+            print("INFO: Dataframe has 0 records to process")
+            break
 
+        # convert to pyarrow table
         table = pyarrow.Table.from_pandas(pdf)
 
         if feature_view.batch_source.field_mapping is not None:
@@ -266,10 +276,89 @@ def _map_by_partition(
             rows_to_write,
             lambda x: None,
         )
-        end_time = time.time()
-        print(
-            f"INFO!!! Processed batch with size {pdf_row_count} in {int((end_time - start_time) * 1000)} milliseconds"
+
+        batch_time = time.perf_counter() - start_time
+
+        (
+            total_batches,
+            total_time,
+            min_time,
+            max_time,
+            total_rows,
+            min_batch_size,
+            max_batch_size,
+        ) = update_exec_stats(
+            total_batches,
+            total_time,
+            min_time,
+            max_time,
+            total_rows,
+            min_batch_size,
+            max_batch_size,
+            batch_time,
+            pdf_row_count,
         )
+
+    if total_batches > 0:
+        print_exec_stats(
+            total_batches,
+            total_time,
+            min_time,
+            max_time,
+            total_rows,
+            min_batch_size,
+            max_batch_size,
+        )
+
     yield pd.DataFrame(
         [pd.Series(range(1, 2))]
     )  # dummy result because mapInPandas needs to return something
+
+
+def update_exec_stats(
+    total_batches,
+    total_time,
+    min_time,
+    max_time,
+    total_rows,
+    min_batch_size,
+    max_batch_size,
+    batch_time,
+    current_batch_size,
+):
+    total_batches += 1
+    total_time += batch_time
+    min_time = min(min_time, batch_time)
+    max_time = max(max_time, batch_time)
+
+    total_rows += current_batch_size
+    min_batch_size = min(min_batch_size, current_batch_size)
+    max_batch_size = max(max_batch_size, current_batch_size)
+
+    return (
+        total_batches,
+        total_time,
+        min_time,
+        max_time,
+        total_rows,
+        min_batch_size,
+        max_batch_size,
+    )
+
+
+def print_exec_stats(
+    total_batches,
+    total_time,
+    min_time,
+    max_time,
+    total_rows,
+    min_batch_size,
+    max_batch_size,
+):
+    # TODO: Investigate why the logger is not working in Spark Executors
+    avg_time = total_time / total_batches
+    avg_batch_size = total_rows / total_batches
+    print(
+        f"Time - Total: {total_time:.6f}s, Avg: {avg_time:.6f}s, Min: {min_time:.6f}s, Max: {max_time:.6f}s | "
+        f"Batch Size - Total: {total_rows}, Avg: {avg_batch_size:.2f}, Min: {min_batch_size}, Max: {max_batch_size}"
+    )
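
For context, the accumulation pattern introduced by this diff can be exercised outside Spark. Below is a minimal standalone sketch: the batch sizes and the time.sleep() stand-in are made up for illustration; in the real code the timed section is the online-store write inside mapInPandas.

import time

# Running aggregates, initialized as in the diff above.
total_batches = 0
total_time = 0.0
min_time, max_time = float("inf"), float("-inf")
total_rows = 0
min_batch_size, max_batch_size = float("inf"), float("-inf")

for batch_size in (1000, 250, 4000):  # hypothetical batch sizes
    start_time = time.perf_counter()
    time.sleep(batch_size / 1_000_000)  # stand-in for the online-store write
    batch_time = time.perf_counter() - start_time

    # Same bookkeeping update_exec_stats performs in the diff.
    total_batches += 1
    total_time += batch_time
    min_time = min(min_time, batch_time)
    max_time = max(max_time, batch_time)
    total_rows += batch_size
    min_batch_size = min(min_batch_size, batch_size)
    max_batch_size = max(max_batch_size, batch_size)

if total_batches > 0:  # guard against an empty partition
    print(
        f"Time - Total: {total_time:.6f}s, Avg: {total_time / total_batches:.6f}s, "
        f"Min: {min_time:.6f}s, Max: {max_time:.6f}s | "
        f"Batch Size - Total: {total_rows}, Avg: {total_rows / total_batches:.2f}, "
        f"Min: {min_batch_size}, Max: {max_batch_size}"
    )

Two design points from the diff are worth noting. time.perf_counter() replaces time.time() because it is a monotonic, high-resolution clock intended for measuring elapsed intervals, whereas time.time() reads the wall clock and can jump if the system clock is adjusted. And the zero-row case now uses break instead of return, so even when a partition runs dry the summary stats are still printed and the dummy result row is still yielded.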
