feat: add formula annotation support (#1)

bpblanken · web-flow · commit 8815a01d4ea5 · 2026-05-06T14:43:29.000-06:00
* add formula annotation

* try another strategy

* install CI dependencies into the system Python (`uv pip install --system`)

* Compare chart images by pixel equality rather than raw file bytes
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
             **/pyproject.toml
 
       - name: Install dependencies
-        run: uv sync --locked --python-preference=system --group dev
+        run: uv pip install -e ".[dev]" --system
 
       - name: Run tests
         run: uv run pytest
diff --git a/tests/test_charts.py b/tests/test_charts.py
@@ -3,6 +3,7 @@
 import re
 from pathlib import Path
 
+import numpy as np
 import openpyxl
 from openpyxl.chart import (
     AreaChart,
@@ -24,6 +25,7 @@
     SurfaceChart,
     SurfaceChart3D,
 )
+from PIL import Image
 
 from xldown.converter import render_chart
 
@@ -328,6 +330,28 @@ def make_surface3d():
     return wb, chart
 
 
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _compare_images(generated_path, expected_path):
+    """Compare two PNG images pixel-by-pixel using numpy arrays.
+
+    Returns None if images are identical, or an error message if they differ.
+    """
+    generated_array = np.array(Image.open(generated_path))
+    expected_array = np.array(Image.open(expected_path))
+
+    if generated_array.shape != expected_array.shape:
+        return f"Different dimensions: {generated_array.shape} vs {expected_array.shape}"
+
+    if np.any(generated_array != expected_array):
+        return "Pixel values differ"
+
+    return None
+
+
 # ---------------------------------------------------------------------------
 # Parametrized test
 # ---------------------------------------------------------------------------
@@ -387,9 +411,6 @@ def test_chart_images_match_fixtures(tmp_path: Path):
         result = render_chart(wb, chart, output_path)
         assert result, f"{name} ({i}): render_chart returned False"
 
-        with open(output_path, "rb") as f:
-            generated = f.read()
-        with open(fixtures_dir / f"{i}.png", "rb") as f:
-            expected = f.read()
-
-        assert generated == expected, f"{name} ({i}): Generated image differs from fixture"
+        fixture_path = fixtures_dir / f"{i}.png"
+        error = _compare_images(output_path, fixture_path)
+        assert error is None, f"{name} ({i}): {error}"
diff --git a/tests/test_formatting.py b/tests/test_formatting.py
@@ -112,30 +112,36 @@ def test_all_formatting_types(tmp_path: Path):
     ws["B1"].font = Font(bold=True)
     ws["C1"] = "Value"
     ws["C1"].font = Font(bold=True)
+    ws["D1"] = "Double"
+    ws["D1"].font = Font(bold=True)
 
     # Bold text
     ws["A2"] = "Alice"
     ws["A2"].font = Font(bold=True)
     ws["B2"] = "Active"
     ws["C2"] = 100
+    ws["D2"] = "=SUM($A$1)"
 
     # Italic text
     ws["A3"] = "Bob"
     ws["A3"].font = Font(italic=True)
     ws["B3"] = "Inactive"
     ws["C3"] = 50
+    ws["D3"] = "=SUM($A$1)"
 
     # Strikethrough text
     ws["A4"] = "Charlie"
     ws["A4"].font = Font(strikethrough=True)
     ws["B4"] = "Pending"
     ws["C4"] = 75
+    ws["D4"] = "=SUM($A$1)"
 
     # Bold + Italic
     ws["A5"] = "Diana"
     ws["A5"].font = Font(bold=True, italic=True)
     ws["B5"] = "Active"
     ws["C5"] = 200
+    ws["D5"] = "=SUM($A$1)"
 
     # Red text (annotation) with comment
     ws["A6"] = "Eve"
@@ -207,25 +213,27 @@ def test_all_formatting_types(tmp_path: Path):
 
 ## Table
 
-| **Name**           | **Status** (hidden)   | **Value**   |
-|:-------------------|:----------------------|:------------|
-| **Alice**          | Active                | 100         |
-| *Bob*              | Inactive              | 50          |
-| ~~Charlie~~        | Pending               | 75          |
-| ***Diana***        | Active                | 200         |
-| Eve                | Error                 | 0           |
-| Frank              | Warning               | 25          |
-| <sub>H2O</sub>     | Normal                | 200         |
-| H2O                | H<sub>2</sub>O        | 100         |
-| Rotated 90° (↻90°) | Normal                | 300         |
-| Grace              | OK                    | 10          |
-| left               | apex                  | right       |
+| **Name**           | **Status** (hidden)   | **Value**   | **Double**   |
+|:-------------------|:----------------------|:------------|:-------------|
+| **Alice**          | Active                | 100         |              |
+| *Bob*              | Inactive              | 50          |              |
+| ~~Charlie~~        | Pending               | 75          |              |
+| ***Diana***        | Active                | 200         |              |
+| Eve                | Error                 | 0           |              |
+| Frank              | Warning               | 25          |              |
+| <sub>H2O</sub>     | Normal                | 200         |              |
+| H2O                | H<sub>2</sub>O        | 100         |              |
+| Rotated 90° (↻90°) | Normal                | 300         |              |
+| Grace              | OK                    | 10          |              |
+| left               | apex                  | right       |              |
 
 ### Annotations
 *(Cell references are relative to the table above)*
 
 - B1:B10: bg_color=FFFFFF00
 
+- D2:D5: formula==SUM($A$1)
+
 - A6: fg_color=FFFF0000
 
 - A12:C12: bg_color=FF0000FF
diff --git a/xldown/cells.py b/xldown/cells.py
@@ -92,11 +92,12 @@ class CellAnnotation(BaseModel):
     fg_color: str | None = None
     bg_color: str | None = None
     border: str | None = None
+    formula: str | None = None
     category: str | None = None
 
     @classmethod
-    def from_cell(cls, cell: Cell) -> "CellAnnotation":
-        """Extract formatting annotations (colors, borders) from a cell."""
+    def from_cell(cls, cell: Cell, formula: str | None = None) -> "CellAnnotation":
+        """Extract formatting annotations (colors, borders, formula) from a cell."""
         font = cell.font or {}
         fill = cell.fill or {}
         border = cell.border or {}
@@ -121,6 +122,7 @@ def from_cell(cls, cell: Cell) -> "CellAnnotation":
             fg_color=fg_color,
             bg_color=bg_color,
             border=border_style,
+            formula=formula,
         )
 
 
diff --git a/xldown/converter.py b/xldown/converter.py
@@ -19,7 +19,7 @@
 
 
 def read_sheet(
-    xlsx_path: Path, sheet_name: str, ws
+    xlsx_path: Path, sheet_name: str, ws, formula_ws=None
 ) -> list[tuple[RegionKind, pd.DataFrame | str, dict[tuple[int, int], CellAnnotation], dict[tuple[int, int], CellMetadata], set[str]]]:
     """Read Excel sheet and decompose into tables and prose.
 
@@ -30,6 +30,12 @@ def read_sheet(
     - PROSE: single isolated cell (returned as plain text)
     - TABLE: multi-cell region (first row = headers, returned as DataFrame)
 
+    Args:
+        xlsx_path: Path to the Excel file (unused but kept for compatibility)
+        sheet_name: Name of the sheet being read
+        ws: Worksheet object with data_only=True
+        formula_ws: Optional worksheet object with data_only=False for extracting formulas
+
     Returns list of tuples: (kind, content, annotations, metadata, hidden_columns) where:
     - kind: RegionKind.PROSE or RegionKind.TABLE
     - content: str for prose, pd.DataFrame for table
@@ -39,11 +45,24 @@ def read_sheet(
     # Phase 1: Extract raw values and cell objects from worksheet.
     # Each row's list only contains cells up to the last non-empty cell in that row,
     # so rows may have different lengths and need padding later.
+    # Collect formulas keyed by (row_idx, col_idx) if formula_ws is provided.
     data: list[list[str | int | float | bool | None]] = []
     cell_objects: list[list[Cell]] = []
-    for row_idx, row in enumerate(ws.iter_rows(values_only=False), 1):
+    cell_formulas: dict[tuple[int, int], str] = {}
+
+    if formula_ws:
+        rows_iter = zip(ws.iter_rows(values_only=False), formula_ws.iter_rows(values_only=False))
+    else:
+        rows_iter = ((row, None) for row in ws.iter_rows(values_only=False))
+
+    for row_idx, (row, formula_row) in enumerate(rows_iter):
         data.append([cell.value for cell in row])
-        cell_objects.append(list(row))
+        cells = list(row)
+        if formula_row:
+            for col_idx, (cell, formula_cell) in enumerate(zip(cells, formula_row)):
+                if formula_cell.value and isinstance(formula_cell.value, str) and formula_cell.value.startswith("="):
+                    cell_formulas[(row_idx, col_idx)] = formula_cell.value
+        cell_objects.append(cells)
 
     if not data:
         return []
@@ -106,8 +125,8 @@ def read_sheet(
                         data_row = row_idx - region.min_row + 1
                         data_col = col_idx - region.min_col + 1
 
-                        annotation = CellAnnotation.from_cell(cell)
-                        if annotation.fg_color or annotation.bg_color or annotation.border:
+                        annotation = CellAnnotation.from_cell(cell, formula=cell_formulas.get((row_idx, col_idx)))
+                        if annotation.fg_color or annotation.bg_color or annotation.border or annotation.formula:
                             annotations[(data_row, data_col)] = annotation
 
                         cell_metadata = CellMetadata.from_cell(cell)
@@ -166,14 +185,16 @@ def excel_to_markdown(
     images_dir.mkdir(parents=True, exist_ok=True)
 
     wb = load_workbook(xlsx_path, data_only=True, rich_text=True)
+    wb_formulas = load_workbook(xlsx_path, data_only=False, rich_text=True)
 
     md_parts: list[str] = []
     chart_counter = 0
     img_counter = 0
 
     for sheet_name in wb.sheetnames:
         ws = wb[sheet_name]
-        regions = read_sheet(xlsx_path, sheet_name, ws)
+        formula_ws = wb_formulas[sheet_name]
+        regions = read_sheet(xlsx_path, sheet_name, ws, formula_ws=formula_ws)
 
         md_parts.append(f"# {sheet_name}\n")
 
@@ -205,6 +226,8 @@ def excel_to_markdown(
                         parts.append(f"bg_color={annotation.bg_color}")
                     if annotation.border:
                         parts.append(f"border={annotation.border}")
+                    if annotation.formula:
+                        parts.append(f"formula={annotation.formula}")
                     if parts:
                         md_parts.append(f"- {range_str}: {' '.join(parts)}\n")