Add plot tree, add comment on how to run it, comment branching-ilp pa…

…rt, do some sanity checks about branching levels
algo-hhu · Jul 23, 2024 · 66dfc87 · 66dfc87
1 parent fcbe8e9
commit 66dfc87
Show file tree

Hide file tree

Showing 3 changed files with 138 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -62,6 +62,18 @@ res = ilp.fit()
 
 with open("output.json", "w") as f:
     json.dump(res, f, indent=4, cls=JsonEncoder)
+
+# You can also print the branch and bound tree to visualize the decisions made by the algorithm
+# Below you find an example of the tree and meaning of the colors
+from exact_kmeans.plot_tree import plot
+
+plot(
+  nodes=res["processed_cluster_sizes"],
+  filename="test",
+  plot_folder="plots",
+  optimal_objective=res["objective"],
+)
+
 ```
 
 ## Command Line
@@ -97,7 +109,7 @@ Your `data-path` should be a file with a header containing only the comma-separa
 Assuming that you have run the program and stored the results in a JSON file, you can plot the tree produced by the algorithm.
 
 ```bash
-poetry run python plot_tree.py --output-json output.json --plot-folder plots
+poetry run python exact_kmeans.plot_tree.py --output-json output.json --plot-folder plots
 ```
 
 This will create a tree that looks like this:

diff --git a/exact_kmeans/ilp.py b/exact_kmeans/ilp.py
@@ -106,7 +106,13 @@ def __init__(
 
         self.ilp_branching_until_level = 0
         if self.config.get("branching_levels", False):
-            self.ilp_branching_until_level = int(self.config.get("branching_levels"))
+            # This should be at most k - 2
+            # At k we already have the final cluster size that we are testing
+            # At k - 1 we consider the last cluster size before filling with everything else
+            # So we only compute the branching ILP until k - 2
+            self.ilp_branching_until_level = min(
+                int(self.config.get("branching_levels")), self.k - 2
+            )
             logger.info(
                 "Computing ILP on branching cluster sizes "
                 f"when the cluster sizes are less than {self.ilp_branching_until_level}."
@@ -714,7 +720,12 @@ def enumerate_sizes(
                 )
                 # Run the ILP if we have more than one cluster size to
                 # see if we should branch from here
-                if len(current_cluster_sizes) <= self.ilp_branching_until_level:
+                # It does not make sense to run it if we only have one value
+                # Because we have done it before with the other ILP
+                if (
+                    len(current_cluster_sizes) > 2
+                    and len(current_cluster_sizes) <= self.ilp_branching_until_level
+                ):
                     if self.config.get("fill_cluster_sizes", False):
                         test_sizes = self.fix_rem_cluster_sizes(current_cluster_sizes)
                     else:
@@ -737,26 +748,28 @@ def enumerate_sizes(
                             tightest_upper_bound.value,
                             add_remaining_points=True,
                         )
-                    if not self.config.get("fill_cluster_sizes", False) and isinstance(
-                        found_bound, float
-                    ):
-                        n_fixed_points += search_end
-                        k_fixed += 1
-                        dp_bound = (
-                            found_bound
-                            + self.dp_bounds[self.n - n_fixed_points][self.k - k_fixed]
-                        )
-                        logger.info(
-                            f"Bound for {test_sizes} ({found_bound}) with DP bound ({dp_bound})"
-                        )
-                        if dp_bound > tightest_upper_bound.value:
-                            logger.info(
-                                f"Bound for {test_sizes} ({found_bound}) "
-                                f"with DP bound ({dp_bound}) "
-                                "is greater than the current upper bound "
-                                f"{tightest_upper_bound.value}, skipping..."
-                            )
-                            found_bound = "ilp_sum_bound_greater"
+
+                    # TODO: This is still not working properly
+                    # if not self.config.get("fill_cluster_sizes", False) and isinstance(
+                    #     found_bound, float
+                    # ):
+                    #     n_fixed_points += search_end
+                    #     k_fixed += 1
+                    #     dp_bound = (
+                    #         found_bound
+                    #         + self.dp_bounds[self.n - n_fixed_points][self.k - k_fixed]
+                    #     )
+                    #     logger.info(
+                    #     f"Bound for {test_sizes} ({found_bound}) with DP bound ({dp_bound})"
+                    #     )
+                    #     if dp_bound > tightest_upper_bound.value:
+                    #         logger.info(
+                    #             f"Bound for {test_sizes} ({found_bound}) "
+                    #             f"with DP bound ({dp_bound}) "
+                    #             "is greater than the current upper bound "
+                    #             f"{tightest_upper_bound.value}, skipping..."
+                    #         )
+                    #         found_bound = "ilp_sum_bound_greater"
                 if found_bound not in {"infeasible", "ilp_sum_bound_greater"}:
                     found_bound = "branch"
                     # If the program is feasible and we have less than k clusters
@@ -900,6 +913,7 @@ def compute_best_cluster_sizes(
 
         best_tmp_obj: Optional[float] = None
         best_sizes: Optional[np.ndarray] = None
+
         for obj, _, sizes in self.processed_cluster_sizes:
             if isinstance(obj, float) and (best_tmp_obj is None or obj < best_tmp_obj):
                 best_tmp_obj = obj

diff --git a/exact_kmeans/plot_tree.py b/exact_kmeans/plot_tree.py
@@ -0,0 +1,89 @@
+import argparse
+import json
+from pathlib import Path
+from typing import List, Optional
+
+import graphviz
+import numpy as np
+
+
+def plot(
+    nodes: List,
+    filename: str,
+    plot_folder: Path,
+    optimal_objective: Optional[float] = None,
+    branch_color: str = "gray",
+    penwidth: str = "2",
+) -> None:
+    plot_folder = Path(plot_folder)
+    plot_folder.mkdir(exist_ok=True)
+
+    g = graphviz.Digraph("G", format="png")
+    g.attr(rankdir="LR")
+    g.node("root", "[]", style="filled", fillcolor=branch_color, penwidth=penwidth)
+    nodes.sort(key=lambda x: x[1])
+    for obj, time, sizes in nodes:
+        if obj == "branch":
+            color = branch_color
+        elif obj == "sum_bound_greater":
+            color = "yellow"
+        elif obj == "ilp_sum_bound_greater":
+            color = "orange"
+        elif obj == "infeasible":
+            color = "red"
+        elif obj == "k-1":
+            color = "cyan"
+        elif isinstance(obj, float):
+            color = "green"
+        else:
+            raise ValueError(f"Unknown object {obj}")
+
+        if (
+            optimal_objective is not None
+            and isinstance(obj, float)
+            and np.isclose(obj, optimal_objective)
+        ):
+            border_color = "gold"
+        else:
+            border_color = "black"
+
+        if isinstance(obj, float):
+            node_content = f"{sizes}\n{time:.3f}s\n({obj:.2f})"
+            # edge_content = f"{sizes}"
+        else:
+            node_content = f"{sizes}\n {time:.3f}s"
+
+        edge_content = ""
+
+        g.node(
+            f"{sizes}",
+            node_content,
+            style="filled",
+            fillcolor=color,
+            color=border_color,
+            penwidth=penwidth,
+        )
+        if len(sizes) == 1:
+            g.edge("root", f"{sizes}", edge_content)
+        else:
+            g.edge(f"{sizes[:-1]}", f"{sizes}", edge_content)
+
+    g.render(filename=plot_folder / f"{filename.replace('.json', '')}", format="png")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--output-json", type=Path, required=True)
+    parser.add_argument("--plot-folder", type=Path, required=True)
+
+    args = parser.parse_args()
+
+    with open(args.output_json, "r") as f:
+        jj = json.load(f)
+    plot(
+        nodes=jj["processed_cluster_sizes"],
+        optimal_objective=jj["optimal_objective"],
+        filename=args.output_json.name,
+        plot_folder=args.plot_folder,
+    )