diff --git a/README.md b/README.md index 90d3623..5d1c529 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,18 @@ res = ilp.fit() with open("output.json", "w") as f: json.dump(res, f, indent=4, cls=JsonEncoder) + +# You can also print the branch and bound tree to visualize the decisions made by the algorithm +# Below you find an example of the tree and meaning of the colors +from exact_kmeans.plot_tree import plot + +plot( + nodes=res["processed_cluster_sizes"], + filename="test", + plot_folder="plots", + optimal_objective=res["objective"], +) + ``` ## Command Line @@ -97,7 +109,7 @@ Your `data-path` should be a file with a header containing only the comma-separa Assuming that you have run the program and stored the results in a JSON file, you can plot the tree produced by the algorithm. ```bash -poetry run python plot_tree.py --output-json output.json --plot-folder plots +poetry run python exact_kmeans.plot_tree.py --output-json output.json --plot-folder plots ``` This will create a tree that looks like this: diff --git a/exact_kmeans/ilp.py b/exact_kmeans/ilp.py index d85b09d..16473b5 100755 --- a/exact_kmeans/ilp.py +++ b/exact_kmeans/ilp.py @@ -106,7 +106,13 @@ def __init__( self.ilp_branching_until_level = 0 if self.config.get("branching_levels", False): - self.ilp_branching_until_level = int(self.config.get("branching_levels")) + # This should be at most k - 2 + # At k we already have the final cluster size that we are testing + # At k - 1 we consider the last cluster size before filling with everything else + # So we only compute the branching ILP until k - 2 + self.ilp_branching_until_level = min( + int(self.config.get("branching_levels")), self.k - 2 + ) logger.info( "Computing ILP on branching cluster sizes " f"when the cluster sizes are less than {self.ilp_branching_until_level}." @@ -714,7 +720,12 @@ def enumerate_sizes( ) # Run the ILP if we have more than one cluster size to # see if we should branch from here - if len(current_cluster_sizes) <= self.ilp_branching_until_level: + # It does not make sense to run it if we only have one value + # Because we have done it before with the other ILP + if ( + len(current_cluster_sizes) > 2 + and len(current_cluster_sizes) <= self.ilp_branching_until_level + ): if self.config.get("fill_cluster_sizes", False): test_sizes = self.fix_rem_cluster_sizes(current_cluster_sizes) else: @@ -737,26 +748,28 @@ def enumerate_sizes( tightest_upper_bound.value, add_remaining_points=True, ) - if not self.config.get("fill_cluster_sizes", False) and isinstance( - found_bound, float - ): - n_fixed_points += search_end - k_fixed += 1 - dp_bound = ( - found_bound - + self.dp_bounds[self.n - n_fixed_points][self.k - k_fixed] - ) - logger.info( - f"Bound for {test_sizes} ({found_bound}) with DP bound ({dp_bound})" - ) - if dp_bound > tightest_upper_bound.value: - logger.info( - f"Bound for {test_sizes} ({found_bound}) " - f"with DP bound ({dp_bound}) " - "is greater than the current upper bound " - f"{tightest_upper_bound.value}, skipping..." - ) - found_bound = "ilp_sum_bound_greater" + + # TODO: This is still not working properly + # if not self.config.get("fill_cluster_sizes", False) and isinstance( + # found_bound, float + # ): + # n_fixed_points += search_end + # k_fixed += 1 + # dp_bound = ( + # found_bound + # + self.dp_bounds[self.n - n_fixed_points][self.k - k_fixed] + # ) + # logger.info( + # f"Bound for {test_sizes} ({found_bound}) with DP bound ({dp_bound})" + # ) + # if dp_bound > tightest_upper_bound.value: + # logger.info( + # f"Bound for {test_sizes} ({found_bound}) " + # f"with DP bound ({dp_bound}) " + # "is greater than the current upper bound " + # f"{tightest_upper_bound.value}, skipping..." + # ) + # found_bound = "ilp_sum_bound_greater" if found_bound not in {"infeasible", "ilp_sum_bound_greater"}: found_bound = "branch" # If the program is feasible and we have less than k clusters @@ -900,6 +913,7 @@ def compute_best_cluster_sizes( best_tmp_obj: Optional[float] = None best_sizes: Optional[np.ndarray] = None + for obj, _, sizes in self.processed_cluster_sizes: if isinstance(obj, float) and (best_tmp_obj is None or obj < best_tmp_obj): best_tmp_obj = obj diff --git a/exact_kmeans/plot_tree.py b/exact_kmeans/plot_tree.py new file mode 100644 index 0000000..caf80ef --- /dev/null +++ b/exact_kmeans/plot_tree.py @@ -0,0 +1,89 @@ +import argparse +import json +from pathlib import Path +from typing import List, Optional + +import graphviz +import numpy as np + + +def plot( + nodes: List, + filename: str, + plot_folder: Path, + optimal_objective: Optional[float] = None, + branch_color: str = "gray", + penwidth: str = "2", +) -> None: + plot_folder = Path(plot_folder) + plot_folder.mkdir(exist_ok=True) + + g = graphviz.Digraph("G", format="png") + g.attr(rankdir="LR") + g.node("root", "[]", style="filled", fillcolor=branch_color, penwidth=penwidth) + nodes.sort(key=lambda x: x[1]) + for obj, time, sizes in nodes: + if obj == "branch": + color = branch_color + elif obj == "sum_bound_greater": + color = "yellow" + elif obj == "ilp_sum_bound_greater": + color = "orange" + elif obj == "infeasible": + color = "red" + elif obj == "k-1": + color = "cyan" + elif isinstance(obj, float): + color = "green" + else: + raise ValueError(f"Unknown object {obj}") + + if ( + optimal_objective is not None + and isinstance(obj, float) + and np.isclose(obj, optimal_objective) + ): + border_color = "gold" + else: + border_color = "black" + + if isinstance(obj, float): + node_content = f"{sizes}\n{time:.3f}s\n({obj:.2f})" + # edge_content = f"{sizes}" + else: + node_content = f"{sizes}\n {time:.3f}s" + + edge_content = "" + + g.node( + f"{sizes}", + node_content, + style="filled", + fillcolor=color, + color=border_color, + penwidth=penwidth, + ) + if len(sizes) == 1: + g.edge("root", f"{sizes}", edge_content) + else: + g.edge(f"{sizes[:-1]}", f"{sizes}", edge_content) + + g.render(filename=plot_folder / f"{filename.replace('.json', '')}", format="png") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--output-json", type=Path, required=True) + parser.add_argument("--plot-folder", type=Path, required=True) + + args = parser.parse_args() + + with open(args.output_json, "r") as f: + jj = json.load(f) + plot( + nodes=jj["processed_cluster_sizes"], + optimal_objective=jj["optimal_objective"], + filename=args.output_json.name, + plot_folder=args.plot_folder, + )