Skip to content

Commit

Permalink
Add plot tree, add comment on how to run it, comment branching-ilp pa…
Browse files Browse the repository at this point in the history
…rt, do some sanity checks about branching levels
  • Loading branch information
giuliabaldini committed Jul 23, 2024
1 parent fcbe8e9 commit 66dfc87
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 23 deletions.
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ res = ilp.fit()

with open("output.json", "w") as f:
json.dump(res, f, indent=4, cls=JsonEncoder)

# You can also print the branch and bound tree to visualize the decisions made by the algorithm
# Below you find an example of the tree and meaning of the colors
from exact_kmeans.plot_tree import plot

plot(
nodes=res["processed_cluster_sizes"],
filename="test",
plot_folder="plots",
optimal_objective=res["objective"],
)

```

## Command Line
Expand Down Expand Up @@ -97,7 +109,7 @@ Your `data-path` should be a file with a header containing only the comma-separa
Assuming that you have run the program and stored the results in a JSON file, you can plot the tree produced by the algorithm.

```bash
poetry run python plot_tree.py --output-json output.json --plot-folder plots
poetry run python exact_kmeans.plot_tree.py --output-json output.json --plot-folder plots
```

This will create a tree that looks like this:
Expand Down
58 changes: 36 additions & 22 deletions exact_kmeans/ilp.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,13 @@ def __init__(

self.ilp_branching_until_level = 0
if self.config.get("branching_levels", False):
self.ilp_branching_until_level = int(self.config.get("branching_levels"))
# This should be at most k - 2
# At k we already have the final cluster size that we are testing
# At k - 1 we consider the last cluster size before filling with everything else
# So we only compute the branching ILP until k - 2
self.ilp_branching_until_level = min(
int(self.config.get("branching_levels")), self.k - 2
)
logger.info(
"Computing ILP on branching cluster sizes "
f"when the cluster sizes are less than {self.ilp_branching_until_level}."
Expand Down Expand Up @@ -714,7 +720,12 @@ def enumerate_sizes(
)
# Run the ILP if we have more than one cluster size to
# see if we should branch from here
if len(current_cluster_sizes) <= self.ilp_branching_until_level:
# It does not make sense to run it if we only have one value
# Because we have done it before with the other ILP
if (
len(current_cluster_sizes) > 2
and len(current_cluster_sizes) <= self.ilp_branching_until_level
):
if self.config.get("fill_cluster_sizes", False):
test_sizes = self.fix_rem_cluster_sizes(current_cluster_sizes)
else:
Expand All @@ -737,26 +748,28 @@ def enumerate_sizes(
tightest_upper_bound.value,
add_remaining_points=True,
)
if not self.config.get("fill_cluster_sizes", False) and isinstance(
found_bound, float
):
n_fixed_points += search_end
k_fixed += 1
dp_bound = (
found_bound
+ self.dp_bounds[self.n - n_fixed_points][self.k - k_fixed]
)
logger.info(
f"Bound for {test_sizes} ({found_bound}) with DP bound ({dp_bound})"
)
if dp_bound > tightest_upper_bound.value:
logger.info(
f"Bound for {test_sizes} ({found_bound}) "
f"with DP bound ({dp_bound}) "
"is greater than the current upper bound "
f"{tightest_upper_bound.value}, skipping..."
)
found_bound = "ilp_sum_bound_greater"

# TODO: This is still not working properly
# if not self.config.get("fill_cluster_sizes", False) and isinstance(
# found_bound, float
# ):
# n_fixed_points += search_end
# k_fixed += 1
# dp_bound = (
# found_bound
# + self.dp_bounds[self.n - n_fixed_points][self.k - k_fixed]
# )
# logger.info(
# f"Bound for {test_sizes} ({found_bound}) with DP bound ({dp_bound})"
# )
# if dp_bound > tightest_upper_bound.value:
# logger.info(
# f"Bound for {test_sizes} ({found_bound}) "
# f"with DP bound ({dp_bound}) "
# "is greater than the current upper bound "
# f"{tightest_upper_bound.value}, skipping..."
# )
# found_bound = "ilp_sum_bound_greater"
if found_bound not in {"infeasible", "ilp_sum_bound_greater"}:
found_bound = "branch"
# If the program is feasible and we have less than k clusters
Expand Down Expand Up @@ -900,6 +913,7 @@ def compute_best_cluster_sizes(

best_tmp_obj: Optional[float] = None
best_sizes: Optional[np.ndarray] = None

for obj, _, sizes in self.processed_cluster_sizes:
if isinstance(obj, float) and (best_tmp_obj is None or obj < best_tmp_obj):
best_tmp_obj = obj
Expand Down
89 changes: 89 additions & 0 deletions exact_kmeans/plot_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import argparse
import json
from pathlib import Path
from typing import List, Optional

import graphviz
import numpy as np


def plot(
nodes: List,
filename: str,
plot_folder: Path,
optimal_objective: Optional[float] = None,
branch_color: str = "gray",
penwidth: str = "2",
) -> None:
plot_folder = Path(plot_folder)
plot_folder.mkdir(exist_ok=True)

g = graphviz.Digraph("G", format="png")
g.attr(rankdir="LR")
g.node("root", "[]", style="filled", fillcolor=branch_color, penwidth=penwidth)
nodes.sort(key=lambda x: x[1])
for obj, time, sizes in nodes:
if obj == "branch":
color = branch_color
elif obj == "sum_bound_greater":
color = "yellow"
elif obj == "ilp_sum_bound_greater":
color = "orange"
elif obj == "infeasible":
color = "red"
elif obj == "k-1":
color = "cyan"
elif isinstance(obj, float):
color = "green"
else:
raise ValueError(f"Unknown object {obj}")

if (
optimal_objective is not None
and isinstance(obj, float)
and np.isclose(obj, optimal_objective)
):
border_color = "gold"
else:
border_color = "black"

if isinstance(obj, float):
node_content = f"{sizes}\n{time:.3f}s\n({obj:.2f})"
# edge_content = f"{sizes}"
else:
node_content = f"{sizes}\n {time:.3f}s"

edge_content = ""

g.node(
f"{sizes}",
node_content,
style="filled",
fillcolor=color,
color=border_color,
penwidth=penwidth,
)
if len(sizes) == 1:
g.edge("root", f"{sizes}", edge_content)
else:
g.edge(f"{sizes[:-1]}", f"{sizes}", edge_content)

g.render(filename=plot_folder / f"{filename.replace('.json', '')}", format="png")


if __name__ == "__main__":
parser = argparse.ArgumentParser()

parser.add_argument("--output-json", type=Path, required=True)
parser.add_argument("--plot-folder", type=Path, required=True)

args = parser.parse_args()

with open(args.output_json, "r") as f:
jj = json.load(f)
plot(
nodes=jj["processed_cluster_sizes"],
optimal_objective=jj["optimal_objective"],
filename=args.output_json.name,
plot_folder=args.plot_folder,
)

0 comments on commit 66dfc87

Please sign in to comment.