Skip to content

Commit 82965d5

Browse files
authored
Add normalize_chunksize and partition utility functions (#47)
1 parent edf4f06 commit 82965d5

File tree

6 files changed

+181
-30
lines changed

6 files changed

+181
-30
lines changed

Diff for: .pre-commit-config.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ repos:
7171
additional_dependencies: [tomli]
7272
files: ^(graphblas_algorithms|docs)/
7373
- repo: https://github.com/charliermarsh/ruff-pre-commit
74-
rev: v0.0.249
74+
rev: v0.0.252
7575
hooks:
7676
- id: ruff
7777
- repo: https://github.com/pre-commit/pre-commit-hooks

Diff for: graphblas_algorithms/nxapi/_utils.py

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
from math import ceil
2+
from numbers import Number
3+
4+
try:
5+
from itertools import pairwise # Added in Python 3.10
6+
except ImportError:
7+
8+
def pairwise(it):
9+
it = iter(it)
10+
for prev in it:
11+
for cur in it:
12+
yield (prev, cur)
13+
prev = cur
14+
15+
# Multipliers that map a lowercase size suffix to its byte count:
# decimal SI units ("kb" = 1000) and binary IEC units ("kib" = 1024).
# "" and "b" both mean plain bytes.
BYTES_UNITS = {
    "": 1,
    "b": 1,
    **{f"{prefix}b": 1000**power for power, prefix in enumerate("kmgtpez", 1)},
    **{f"{prefix}ib": 1024**power for power, prefix in enumerate("kmgtpez", 1)},
}
35+
def normalize_chunksize(chunksize, itemsize=1, N=None):
    """Normalize `chunksize` to a positive number of items per chunk, or None.

    ``None`` means "do not chunk; process everything at once".  Accepted inputs:

    - ``None``, ``""``, or ``"all"``: no chunking.
    - A number (or numeric string): the chunk size as an item count.
    - ``"<number> chunks"``: derive the size so the items split into that many
      chunks (requires ``N``, the total number of items).
    - ``"<number> <unit>"`` (e.g. ``"256 MiB"``): a target size in bytes,
      converted to an item count using ``itemsize`` (bytes per item).

    Any result that is not strictly between 0 and ``N`` collapses to ``None``.
    """
    if chunksize is None:
        return None
    if isinstance(chunksize, Number):
        size = int(chunksize)
        if size <= 0 or N is not None and size >= N:
            return None
        return size
    if not isinstance(chunksize, str):
        raise TypeError(f"chunksize must be a number or a string; got {type(chunksize)}")
    text = chunksize.replace(" ", "").replace("_", "").lower()
    if not text or text == "all":
        return None
    # Split the string just after its last digit into a numeric part and a unit suffix.
    for pos in range(len(text) - 1, -1, -1):
        if text[pos].isdigit():
            boundary = pos + 1
            break
    else:
        # No digits at all (e.g. "mb"): treat as 1 of the given unit.
        text = f"1{text}"
        boundary = 1

    numeric = text[:boundary]
    unit = text[boundary:]

    try:
        value = float(numeric)
    except ValueError as exc:
        raise ValueError(
            f"Bad chunksize: {chunksize!r}. Could not interpret {numeric!r} as a number."
        ) from exc

    if unit in {"chunk", "chunks"}:
        if value <= 1:
            return None
        if N is None:
            raise TypeError(
                f"N argument is required to determine chunksize to split into {int(value)} chunks"
            )
        size = ceil(N / value)
    else:
        scale = BYTES_UNITS.get(unit)
        if scale is None:
            raise ValueError(
                f"Bad chunksize: {chunksize!r}. Could not interpret {unit!r} as a bytes unit."
            )
        value *= scale
        if text[-1] == "b":
            # A bytes unit was given: convert bytes to a count of items.
            value = max(1, value / itemsize)
        size = int(round(value))
    if size <= 0 or N is not None and size >= N:
        return None
    return size
89+
90+
def partition(chunksize, L, *, evenly=True):
91+
"""Partition a list into chunks"""
92+
N = len(L)
93+
if N == 0:
94+
return
95+
chunksize = int(chunksize)
96+
if chunksize <= 0 or chunksize >= N:
97+
yield L
98+
return
99+
if chunksize == 1:
100+
yield from L
101+
return
102+
if evenly:
103+
k = ceil(L / chunksize)
104+
if k * chunksize != N:
105+
yield from split_evenly(k, L)
106+
return
107+
for start, stop in pairwise(range(0, N + chunksize, chunksize)):
108+
yield L[start:stop]
109+
110+
def split_evenly(k, L):
    """Split a list into approximately-equal parts.

    Yields up to `k` non-empty slices of `L`; boundaries are computed with
    ceiling division so the parts differ in length by at most one.
    """
    total = len(L)
    if total == 0:
        return
    parts = int(k)
    if parts <= 1:
        yield L
        return
    # Candidate cut points; duplicates (when parts > total) are skipped below.
    cuts = [(total * i + parts - 1) // parts for i in range(1, parts)]
    begin = 0
    for cut in cuts:
        if cut != begin:
            yield L[begin:cut]
            begin = cut
    if cuts[-1] != total:
        yield L[cuts[-1]:]

Diff for: graphblas_algorithms/nxapi/cluster.py

+14-16
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from graphblas_algorithms.classes.graph import to_undirected_graph
66
from graphblas_algorithms.utils import not_implemented_for
77

8+
from ._utils import normalize_chunksize, partition
9+
810
__all__ = [
911
"triangles",
1012
"transitivity",
@@ -90,11 +92,11 @@ def _split(L, k):
9092

9193

9294
# TODO: should this move into algorithms?
93-
def _square_clustering_split(G, node_ids=None, *, nsplits):
95+
def _square_clustering_split(G, node_ids=None, *, chunksize):
9496
if node_ids is None:
9597
node_ids, _ = G._A.reduce_rowwise(monoid.any).to_coo(values=False)
9698
result = None
97-
for chunk_ids in _split(node_ids, nsplits):
99+
for chunk_ids in partition(chunksize, node_ids):
98100
res = algorithms.square_clustering(G, chunk_ids)
99101
if result is None:
100102
result = res
@@ -103,36 +105,32 @@ def _square_clustering_split(G, node_ids=None, *, nsplits):
103105
return result
104106

105107

def square_clustering(G, nodes=None, *, chunksize="256 MiB"):
    # `chunksize` is used to split the computation into chunks.
    # square_clustering computes `A @ A`, which can get very large, even dense.
    # The default `chunksize` is to choose the number so that `Asubset @ A`
    # will be about 256 MB if dense.
    G = to_undirected_graph(G)
    if len(G) == 0:
        return {}

    # NOTE(review): itemsize here is bytes-per-row of the dense intermediate;
    # normalize_chunksize converts the byte target into a row count.
    chunksize = normalize_chunksize(chunksize, len(G) * G._A.dtype.np_type.itemsize, len(G))

    if nodes is None:
        # Should we use this one for subsets of nodes as well?
        if chunksize is None:
            result = algorithms.square_clustering(G)
        else:
            result = _square_clustering_split(G, chunksize=chunksize)
        return G.vector_to_nodemap(result, fill_value=0)
    if nodes in G:
        # Single-node fast path.
        node_index = G._key_to_id[nodes]
        return algorithms.single_square_clustering(G, node_index)
    node_ids = G.list_to_ids(nodes)
    if chunksize is None:
        result = algorithms.square_clustering(G, node_ids)
    else:
        result = _square_clustering_split(G, node_ids, chunksize=chunksize)
    return G.vector_to_nodemap(result)

138136

Diff for: graphblas_algorithms/nxapi/shortest_paths/weighted.py

+5-13
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from graphblas_algorithms import algorithms
22
from graphblas_algorithms.classes.digraph import to_graph
33

4+
from .._utils import normalize_chunksize, partition
45
from ..exception import NetworkXUnbounded, NodeNotFound
56

67
__all__ = [
@@ -9,18 +10,14 @@
910
]
1011

1112

12-
def all_pairs_bellman_ford_path_length(G, weight="weight", *, chunksize="auto"):
13+
def all_pairs_bellman_ford_path_length(G, weight="weight", *, chunksize="10 MiB"):
1314
# Larger chunksize offers more parallelism, but uses more memory.
1415
# Chunksize indicates for how many source nodes to compute at one time.
1516
# The default is to choose the number of rows so the result, if dense,
1617
# will be about 10MB.
1718
G = to_graph(G, weight=weight)
18-
if chunksize == "auto":
19-
# TODO: make a utility function for this that can be reused
20-
targetsize = 10 * 1024 * 1024 # 10 MB
21-
chunksize = max(1, targetsize // (len(G) * G._A.dtype.np_type.itemsize))
22-
23-
if chunksize is None or chunksize <= 0 or chunksize >= len(G):
19+
chunksize = normalize_chunksize(chunksize, len(G) * G._A.dtype.np_type.itemsize, len(G))
20+
if chunksize is None:
2421
# All at once
2522
try:
2623
D = algorithms.bellman_ford_path_lengths(G)
@@ -35,12 +32,7 @@ def all_pairs_bellman_ford_path_length(G, weight="weight", *, chunksize="auto"):
3532
raise NetworkXUnbounded(*e.args) from e
3633
yield (source, G.vector_to_nodemap(d))
3734
else:
38-
# We should probably make a utility function for chunking
39-
nodes = list(G)
40-
for start, stop in zip(
41-
range(0, len(nodes), chunksize), range(chunksize, len(nodes) + chunksize, chunksize)
42-
):
43-
cur_nodes = nodes[start:stop]
35+
for cur_nodes in partition(chunksize, list(G)):
4436
try:
4537
D = algorithms.bellman_ford_path_lengths(G, cur_nodes)
4638
except algorithms.exceptions.Unbounded as e:

Diff for: graphblas_algorithms/nxapi/tests/test_utils.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
import pytest

from graphblas_algorithms.nxapi._utils import normalize_chunksize


def test_normalize_chunksize():
    # Inputs that mean "no chunking" normalize to None.
    for arg, kwargs in [
        (None, {}),
        ("all", {}),
        ("", {}),
        (-1, {}),
        ("-1", {}),
        (10, {"N": 10}),
        ("1 MB", {"N": 100}),
        ("1 chunk", {}),
    ]:
        assert normalize_chunksize(arg, **kwargs) is None
    # Inputs that normalize to a concrete item count.
    assert normalize_chunksize("2 chunks", N=20) == 10
    assert normalize_chunksize(10) == 10
    assert normalize_chunksize(10.0) == 10
    assert normalize_chunksize("10") == 10
    assert normalize_chunksize("10.0") == 10
    assert normalize_chunksize("1_0 B") == 10
    assert normalize_chunksize("1e1") == 10
    assert normalize_chunksize("1e-2 kb") == 10
    assert normalize_chunksize("Mb") == 1000**2
    assert normalize_chunksize(" mb") == 1000**2
    assert normalize_chunksize("gib") == 1024**3
    # Invalid inputs raise with informative messages.
    with pytest.raises(TypeError, match="chunksize must be"):
        normalize_chunksize(object())
    with pytest.raises(ValueError, match="as a bytes"):
        normalize_chunksize("10 badbytes")
    with pytest.raises(ValueError, match="as a number"):
        normalize_chunksize("1bad0 TB")
    with pytest.raises(TypeError, match="N argument is required"):
        normalize_chunksize("10 chunks")

Diff for: pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ ignore = [
199199
"PLR0913", # Too many arguments to function call
200200
"PLR0915", # Too many statements
201201
"PLR2004", # Magic number used in comparison, consider replacing magic with a constant variable
202+
"PLW2901", # Outer for loop variable ... overwritten by inner assignment target (Note: good advice, but too strict)
202203
"RET502", # Do not implicitly `return None` in function able to return non-`None` value
203204
"RET503", # Missing explicit `return` at the end of function able to return non-`None` value
204205
"RET504", # Unnecessary variable assignment before `return` statement

0 commit comments

Comments
 (0)