From 7d9008a4538deb22334b4f47e9a5a842a537881d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 5 Feb 2025 18:24:13 +0100 Subject: [PATCH] Add benchmarks for reductions, and plots for different archs --- bench/ndarray/jit-reduc-float64-plot.py | 169 ++++++++++++++++++++++++ bench/ndarray/jit-reduc-sizes.py | 140 ++++++++++++++++++++ bench/ndarray/run-jit-reduc-sizes.sh | 5 + 3 files changed, 314 insertions(+) create mode 100644 bench/ndarray/jit-reduc-float64-plot.py create mode 100644 bench/ndarray/jit-reduc-sizes.py create mode 100644 bench/ndarray/run-jit-reduc-sizes.sh diff --git a/bench/ndarray/jit-reduc-float64-plot.py b/bench/ndarray/jit-reduc-float64-plot.py new file mode 100644 index 00000000..b9baacc2 --- /dev/null +++ b/bench/ndarray/jit-reduc-float64-plot.py @@ -0,0 +1,169 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +# Plots for the jit vs. numpy benchmarks on different array sizes and platforms. + +import matplotlib.pyplot as plt +import plotly.graph_objects as go + +plotly = True + +sizes = [1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120] +sizes_GB = [n * 1000 * n * 1000 * 8 * 2 / 2**30 for n in sizes] + +intel = False +amd = True +m2linux = False + +# Load the data +if amd: + title_ = "AMD Ryzen 9 9800X3D (64 GB RAM)" + title_ = "np.sum(((a ** 3 + np.sin(a * 2)) < c) & (b > 0), axis=1)" + + create_clevel0 = [ 0.0325, 0.2709, 1.0339, 4.0489, 9.0849, 12.4154, 16.7818, 25.5946, 47.5691, 35.9919, 45.4295, 93.3075, 66.6529 ] + compute_clevel0 = [ 0.0017, 0.0243, 0.0869, 0.3370, 0.7665, 1.0375, 1.3727, 1.7377, 2.1472, 2.6205, 3.0435, 18.5878, 28.0816 ] + + create_clevel0_ooc = [ 0.0305, 0.3371, 1.3249, 5.0602, 11.0410, 16.3685, 22.2012, 27.1348, 31.7409, 38.0690, 47.4424, 56.9335, 62.6965 ] + compute_clevel0_ooc = [ 0.0019, 0.0243, 0.0885, 0.3434, 0.7761, 1.0724, 1.4082, 1.7373, 2.1827, 2.6124, 7.0940, 9.0734, 10.1089 ] + + create_LZ4_l1 = [ 0.0304, 0.2582, 1.0298, 3.9502, 8.8945, 11.9267, 16.3965, 20.2368, 24.6837, 29.3425, 36.2631, 42.1709, 48.0605, 52.3962, 61.5175, 68.6328, 80.1160, 85.4322, 97.1122, 106.9973, 114.8584 ] + compute_LZ4_l1 = [ 0.0018, 0.0210, 0.0756, 0.3003, 0.6609, 0.8886, 1.1285, 1.4453, 1.7959, 2.1889, 2.6978, 3.1586, 3.4286, 3.9929, 4.4590, 5.3601, 5.6702, 6.4690, 6.9764, 7.8714, 8.6404 ] + + create_LZ4_l1_ooc = [ 1.7980, 0.2617, 1.0480, 4.0809, 9.0720, 13.8294, 16.7269, 20.5108, 24.9465, 30.0428, 37.1903, 42.8075, 48.7775, 52.9890, 63.4071, 70.1766, 81.9747, 88.1830, 97.7921, 111.0611, 119.7673 ] + compute_LZ4_l1_ooc = [ 0.0019, 0.0214, 0.0795, 0.3060, 0.6985, 0.9195, 1.1766, 1.5213, 1.8845, 2.2972, 2.8044, 3.2587, 3.5898, 4.1524, 4.6293, 5.5485, 5.8715, 6.7386, 7.3019, 8.2307, 9.0145 ] + + create_ZSTD_l1 = [ 0.0302, 0.2704, 1.0703, 4.1243, 9.2185, 12.5026, 17.0585, 20.8708, 25.5844, 31.0571, 37.7114, 42.8297, 50.2696, 54.5773, 63.6311, 73.0370, 84.0092, 89.0686, 100.3300, 108.8173, 119.1154 ] + compute_ZSTD_l1 = [ 0.0021, 0.0296, 0.1045, 0.3979, 0.8787, 1.3064, 1.7404, 2.1938, 2.6780, 3.3929, 3.8601, 4.3665, 5.0127, 5.7346, 6.1056, 7.9448, 8.2872, 9.4659, 9.2376, 10.4273, 11.6572 ] + + create_ZSTD_l1_ooc = [ 0.6564, 0.2825, 1.0826, 4.1968, 9.5022, 13.4840, 17.5387, 21.5807, 26.0052, 31.3524, 38.5889, 44.1105, 49.8849, 55.5297, 64.6479, 72.7471, 84.6595, 90.4970, 99.9710, 111.6817, 120.8941 ] + compute_ZSTD_l1_ooc = [ 0.0022, 0.0300, 0.1066, 0.4099, 0.8974, 1.3218, 1.7679, 2.2154, 2.7007, 3.4267, 3.9255, 4.4597, 5.1155, 5.8251, 6.2064, 8.0141, 8.4316, 9.3195, 9.4570, 10.7034, 11.9192 ] + + create_numpy = [ 0.0020, 0.0527, 0.2292, 0.9412, 2.1043, 2.8286, 3.7046, 4.7217, 5.8308, 7.0491 ] + compute_numpy = [ 0.0179, 0.2495, 0.9840, 3.9263, 8.8450, 12.0259, 16.3507, 40.1672, 155.1292, 302.5115 ] + + create_numpy_jit = [ 0.0019, 0.0529, 0.2261, 0.9219, 2.0589, 2.8350, 3.7131, 18.4375, 26.5959, 34.5221, 33.7157, 49.6762, 63.1401 ] + compute_numpy_jit = [ 0.0035, 0.0180, 0.0622, 0.2307, 0.5196, 0.7095, 0.9251, 1.1981, 1.4729, 2.2007, 2.0953, 12.6746, 26.6424 ] + +elif intel: + title_ = "Intel Core i9-13900K (32 GB RAM)" + create_clevel0 = [ 0.1810, 0.3511, 1.1511, 4.4575, 10.3164, 17.4344, 24.4274, 37.7116, 36.6179, 53.7264 ] + compute_clevel0 = [ 0.0045, 0.0133, 0.0506, 0.2086, 0.4603, 0.8689, 1.1458, 1.4150, 1.7656, 1.9475 ] + + create_LZ4_l1 = [ 0.1834, 0.3457, 1.1234, 4.3301, 10.0406, 16.9509, 22.1617, 26.3818, 32.4472, 39.3830, 41.9484, 52.6316 ] + compute_LZ4_l1 = [ 0.0014, 0.0128, 0.0494, 0.1958, 0.4387, 0.8207, 1.0208, 1.2739, 1.5062, 1.7446, 2.1553, 2.4458 ] + + create_ZSTD_l1 = [ 0.0362, 0.3734, 1.2009, 4.5362, 10.3706, 18.7104, 23.1148, 27.6572, 33.7207, 41.0326, 44.2322, 54.9467 ] + compute_ZSTD_l1 = [ 0.0028, 0.0193, 0.0799, 0.2226, 0.4983, 0.9072, 1.1624, 1.4375, 1.8162, 2.0918, 2.5067, 2.7760 ] + + create_numpy = [ 0.0046, 0.1160, 0.4327, 1.7166, 3.8661, 7.5005, 14.1090, 18.6720, 64.2425, 108.4532, 529.1393, 962.1662 ] + compute_numpy = [ 0.0240, 0.1920, 0.7217, 2.9316, 6.5893, 14.0353, 47.4275, 99.0893, 187.8040, 202.3973, 460.5915, 551.2776 ] + +elif m2linux: + title_ = "MacBook Air M2 (24 GB RAM)" + create_LZ4_l1 = [0.021555185317993164, 0.2862977981567383, 0.8696625232696533, 3.4979920387268066, 8.235799789428711, 13.708781242370605, 20.74394702911377, 33.23137378692627] + compute_LZ4_l1 = [0.0033464431762695312, 0.03627762794494629, 0.14009513854980468, 0.5438736915588379, 1.2493964672088622, 2.194223642349243, 3.5851136207580567, 5.067658472061157] + + create_numpy = [0.0016903877258300781, 0.04910874366760254, 0.18264532089233398, 0.7124006748199463, 1.8350563049316406, 8.877023935317993, 101.2457287311554, 196.21723294258118] + compute_numpy = [0.003887462615966797, 0.026979732513427734, 0.11047358512878418, 0.4213367462158203, 0.9288184165954589, 1.6470709323883057, 5.5601390361785885, 9.401740503311157] + +else: + title_ = "Mac Mini M4 Pro (24 GB RAM)" + + create_numpy = [ 0.0024, 0.0686, 0.2857, 1.1800, 3.5006, 13.7092, 21.9491, 30.5237, 101.3553, 363.5005, 446.5876, 1509.1826 ] + compute_numpy = [ 0.0046, 0.1173, 0.5066, 2.0908, 5.6268, 13.0679, 16.7926, 20.9192, 25.5899, 34.5382, 46.0664, 1083.7046 ] + + create_numpy_jit = [ 0.0024, 0.0686, 0.2857, 1.1800, 3.5006, 13.7092, 21.9491, 30.5237, 101.3553, 363.5005, 446.5876, 1509.1826 ] + compute_numpy_jit = [ ] + + create_clevel0 = [ 0.0321, 0.6742, 1.6750, 5.1412, 12.4538, 20.6695, 28.0185, 34.7422, 41.3935, 50.4275, 59.5572, 71.3740 ] + compute_clevel0 = [ 0.0017, 0.0224, 0.0761, 0.2439, 0.5827, 0.9816, 1.2265, 6.8712, 9.6391, 11.2875, 13.1953, 15.4047 ] + + create_LZ4_l1 = [ 0.0316, 0.6974, 1.6957, 5.2077, 11.8975, 20.2346, 26.6419, 32.7686, 38.2088, 47.5221, 55.7224, 66.6138 ] + compute_LZ4_l1 = [ 0.0019, 0.0234, 0.0763, 0.2421, 0.5533, 0.9384, 1.2476, 1.5299, 1.8564, 2.1836, 2.5633, 2.9245 ] + + create_ZSTD_l1 = [ 0.0354, 0.7105, 1.7217, 5.2663, 12.7385, 20.3693, 28.1353, 33.0310, 40.3843, 50.2020, 58.0643, 69.5190 ] + compute_ZSTD_l1 = [ 0.0021, 0.0225, 0.0773, 0.2630, 0.8735, 1.0087, 1.9553, 1.5670, 3.1986, 3.3499, 4.0728, 4.6287 ] + + +# Plot the data. There will be 2 plots: one for create times and another for compute times +labels = ['lvl=0', 'LZ4 lvl=1', 'ZSTD lvl=1', 'NumPy', "numpy_jit"] +if plotly: + # Create the create times plot + fig_create = go.Figure() + fig_create.add_trace( + go.Scatter(x=sizes_GB, y=create_clevel0, mode='lines+markers', name=labels[0])) + fig_create.add_trace( + go.Scatter(x=sizes_GB, y=create_clevel0_ooc, mode='lines+markers', name=labels[0] + "(ooc)")) + fig_create.add_trace( + go.Scatter(x=sizes_GB, y=create_LZ4_l1, mode='lines+markers', name=labels[1])) + fig_create.add_trace( + go.Scatter(x=sizes_GB, y=create_LZ4_l1_ooc, mode='lines+markers', name=labels[1] + "(ooc)")) + fig_create.add_trace( + go.Scatter(x=sizes_GB, y=create_ZSTD_l1, mode='lines+markers', name=labels[2])) + fig_create.add_trace( + go.Scatter(x=sizes_GB, y=create_ZSTD_l1_ooc, mode='lines+markers', name=labels[2] + "(ooc)")) + fig_create.add_trace( + go.Scatter(x=sizes_GB, y=create_numpy_jit, mode='lines+markers', name=labels[3])) + fig_create.update_layout(title=f'Create operands times ({title_})', xaxis_title='Size (GB)', yaxis_title='Time (s)') + + # Create the compute times plot + # Calculate the maximum y1 value + y1_max = max(max(compute_clevel0), max(compute_clevel0_ooc), max(compute_LZ4_l1), max(compute_LZ4_l1_ooc), + max(compute_ZSTD_l1), max(compute_ZSTD_l1_ooc), max(compute_numpy), max(compute_numpy_jit)) + + fig_compute = go.Figure() + fig_compute.add_trace( + go.Scatter(x=sizes_GB, y=compute_clevel0, mode='lines+markers', name=labels[0])) + fig_compute.add_trace( + go.Scatter(x=sizes_GB, y=compute_clevel0_ooc, mode='lines+markers', name=labels[0] + "(ooc)")) + fig_compute.add_trace( + go.Scatter(x=sizes_GB, y=compute_LZ4_l1, mode='lines+markers', name=labels[1])) + fig_compute.add_trace( + go.Scatter(x=sizes_GB, y=compute_LZ4_l1_ooc, mode='lines+markers', name=labels[1] + "(ooc)")) + fig_compute.add_trace( + go.Scatter(x=sizes_GB, y=compute_ZSTD_l1, mode='lines+markers', name=labels[2])) + fig_compute.add_trace( + go.Scatter(x=sizes_GB, y=compute_ZSTD_l1_ooc, mode='lines+markers', name=labels[2] + "(ooc)")) + fig_compute.add_trace(go.Scatter(x=sizes_GB, y=compute_numpy, mode='lines+markers', name=labels[3])) + fig_compute.add_trace(go.Scatter(x=sizes_GB, y=compute_numpy_jit, mode='lines+markers', name=labels[4])) + fig_compute.update_layout(title=f'Compute times ({title_})', xaxis_title='Size (GB)', yaxis_title='Time (s)') + + # Add a vertical line at 64 GB + y1_max = 35 + fig_compute.add_shape( + type="line", x0=64, y0=0, x1=64, y1=y1_max, + line=dict(color="Gray", width=2, dash="dot"), + ) + if amd: + fig_compute.add_annotation(x=64, y=y1_max * .9, text="64 GB", showarrow=True, arrowhead=2, ax=40, ay=0) + + # Show the plots + fig_create.show() + fig_compute.show() +else: + plt.figure() + plt.plot(sizes_GB, create_clevel0, "o-", label=labels[0]) + plt.plot(sizes_GB, create_LZ4_l1, "o-", label=labels[1]) + plt.plot(sizes_GB, create_ZSTD_l1, "o-", label=labels[2]) + plt.plot(sizes_GB, create_numpy_jit, "o-", label=labels[3]) + plt.xlabel("Size (GB)") + plt.ylabel("Time (s)") + plt.title(f"Create times ({title_})") + plt.legend() + # Now, the compute times + plt.figure() + plt.plot(sizes_GB, compute_clevel0, "o-", label=labels[0]) + plt.plot(sizes_GB, compute_LZ4_l1, "o-", label=labels[1]) + plt.plot(sizes_GB, compute_ZSTD_l1, "o-", label=labels[2]) + plt.plot(sizes_GB, compute_numpy, "o-", label=labels[3]) + plt.plot(sizes_GB, compute_numpy_jit, "o-", label=labels[4]) + plt.xlabel("Size (GB)") + plt.ylabel("Time (s)") + plt.title(f"Compute times ({title_})") + plt.legend() + plt.show() diff --git a/bench/ndarray/jit-reduc-sizes.py b/bench/ndarray/jit-reduc-sizes.py new file mode 100644 index 00000000..cf7e0d03 --- /dev/null +++ b/bench/ndarray/jit-reduc-sizes.py @@ -0,0 +1,140 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +# Compute reductions for different array sizes, using the jit decorator +# and different operands (NumPy and NDArray). Different compression +# levels and codecs can be selected. For in-memory testing, set ooc to False. + +from time import time +import blosc2 +import numpy as np +import sys + +niter = 5 +#dtype = np.dtype("float32") +dtype = np.dtype("float64") +clevel = 1 +numpy = False +numpy_jit = False +ooc = True # for in-memory testing, set to False + +cparams = cparams_out = None + +# For 64 GB RAM +# sizes_numpy = (1, 5, 10, 20, 30, 35, 40, 45, 50, 55) +# sizes_numpy_jit = (1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 60, 65, 70) +# sizes_clevel0 = (1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 60, 65, 70) +# size_list = (1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110) # limit clevel>=1 float64 + +# For 24 GB RAM +sizes_numpy = (1, 5, 10, 20, 30, 35, 40) # limit numpy float64 +sizes_numpy_jit = (1, 5, 10, 20, 30, 35, 40, 45) # limit numpy float64 +sizes_clevel0 = (1, 5, 10, 20, 30, 35, 40, 45) # limit clevel==0 float64 +size_list = (1, 5, 10, 20, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90) # limit clevel>=1 float64 + +codec = "LZ4" # default codec +if len(sys.argv) > 2: + codec = sys.argv[2] +if len(sys.argv) > 1: + try: + clevel = int(sys.argv[1]) + except ValueError: + if sys.argv[1] == "numpy": + numpy = True + elif sys.argv[1] == "numpy_jit": + numpy = True + numpy_jit = True + else: + raise ValueError("Invalid argument") + +apath = bpath = None +if numpy: + print("Using NumPy arrays as operands") +else: + print("Using NDArray arrays as operands") + cparams = cparams_out = blosc2.CParams(clevel=clevel, codec=blosc2.Codec[codec]) + # cparams_out = blosc2.CParams(clevel=clevel, codec=blosc2.Codec.LZ4) + print("Using cparams: ", cparams) + if ooc: + apath = "a.b2nd" + bpath = "b.b2nd" + + +# The reductions to compute +def compute_reduction_numpy(a, b, c): + return np.sum(((a ** 3 + np.sin(a * 2)) < c) & (b > 0), axis=1) + +@blosc2.jit +def compute_reduction(a, b, c): + return np.sum(((a ** 3 + np.sin(a * 2)) < c) & (b > 0), axis=1) + + +create_times = [] +compute_times = [] +for n in size_list: + if clevel == 0 and n not in sizes_clevel0: + break + if numpy_jit and n not in sizes_numpy_jit: + break + if numpy and not numpy_jit and n not in sizes_numpy: + break + N = n * 1000 + print(f"\nN = {n}000, {dtype=}, size={N ** 2 * 2 * dtype.itemsize / 2**30 }") + chunks = (100, N) + blocks = (1, N) + chunks, blocks = None, None # automatic chunk and block sizes + # Lossy compression + #filters = [blosc2.Filter.TRUNC_PREC, blosc2.Filter.SHUFFLE] + #filters_meta = [8, 0] # keep 8 bits of precision in mantissa + #cparams = blosc2.CParams(clevel=1, codec=blosc2.Codec.LZ4, filters=filters, filters_meta=filters_meta) + + # Create some data operands + t0 = time() + if numpy: + a = np.linspace(0, 1, N * N, dtype=dtype).reshape(N, N) + b = np.linspace(1, 2, N * N, dtype=dtype).reshape(N, N) + #b = a + 1 + c = np.linspace(-10, 10, N, dtype=dtype) + else: + a = blosc2.linspace(0, 1, N * N, dtype=dtype, shape=(N, N), cparams=cparams, urlpath=apath, mode="w") + print("a.chunks, a.blocks, a.schunk.cratio: ", a.chunks, a.blocks, a.schunk.cratio) + b = blosc2.linspace(1, 2, N * N, dtype=dtype, shape=(N, N), cparams=cparams, urlpath=bpath, mode="w") + #b = (a + 1).compute(cparams=cparams, chunks=chunks, blocks=blocks) + #print(b.chunks, b.blocks, b.schunk.cratio, b.cparams) + c = blosc2.linspace(-10, 10, N, dtype=dtype, cparams=cparams) # broadcasting is supported + #c = blosc2.linspace(-10, 10, N * N, dtype=dtype, shape=(N, N), cparams=cparams) + t1 = time() - t0 + print(f"Time to create data: {t1:.4f}") + create_times.append(t1) + + if numpy: + if numpy_jit: + out = compute_reduction(a, b, c) + t0 = time() + for i in range(niter): + out = compute_reduction(a, b, c) + t1 = (time() - t0) / niter + print(f"Time to compute with numpy_jit and NumPy operands: {t1:.4f}") + else: + t0 = time() + nout = compute_reduction_numpy(a, b, c) + t1 = time() - t0 + print(f"Time to compute with NumPy engine: {t1:.4f}") + else: + out = compute_reduction(a, b, c) + t0 = time() + for i in range(niter): + out = compute_reduction(a, b, c) + t1 = (time() - t0) / niter + print(f"Time to compute with numpy_jit and {clevel=}: {t1:.4f}") + compute_times.append(t1) + del a, b, c + +print("\nCreate times: [", ", ".join([f"{t:.4f}" for t in create_times]), "]") +print("Compute times: [", ", ".join([f"{t:.4f}" for t in compute_times]), "]") +print("End of run!\n\n") diff --git a/bench/ndarray/run-jit-reduc-sizes.sh b/bench/ndarray/run-jit-reduc-sizes.sh new file mode 100644 index 00000000..ad7c766d --- /dev/null +++ b/bench/ndarray/run-jit-reduc-sizes.sh @@ -0,0 +1,5 @@ +/usr/bin/time -v python bench/ndarray/jit-reduc-sizes.py numpy +/usr/bin/time -v python bench/ndarray/jit-reduc-sizes.py numpy_jit +/usr/bin/time -v python bench/ndarray/jit-reduc-sizes.py 0 +/usr/bin/time -v python bench/ndarray/jit-reduc-sizes.py 1 LZ4 +/usr/bin/time -v python bench/ndarray/jit-reduc-sizes.py 1 ZSTD