Skip to content

Commit 8815a01

Browse files
authored
feat: add formula annotation support (#1)
* add function annotation * try another strategy * system * Use pixel equality rather than bytes
1 parent 1e48e7a commit 8815a01

5 files changed

Lines changed: 82 additions & 28 deletions

File tree

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
**/pyproject.toml
2727
2828
- name: Install dependencies
29-
run: uv sync --locked --python-preference=system --group dev
29+
run: uv pip install -e ".[dev]" --system
3030

3131
- name: Run tests
3232
run: uv run pytest

tests/test_charts.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import re
44
from pathlib import Path
55

6+
import numpy as np
67
import openpyxl
78
from openpyxl.chart import (
89
AreaChart,
@@ -24,6 +25,7 @@
2425
SurfaceChart,
2526
SurfaceChart3D,
2627
)
28+
from PIL import Image
2729

2830
from xldown.converter import render_chart
2931

@@ -328,6 +330,28 @@ def make_surface3d():
328330
return wb, chart
329331

330332

333+
# ---------------------------------------------------------------------------
334+
# Helpers
335+
# ---------------------------------------------------------------------------
336+
337+
338+
def _compare_images(generated_path, expected_path):
    """Compare two image files pixel-by-pixel using numpy arrays.

    Args:
        generated_path: Path of the freshly rendered image.
        expected_path: Path of the reference (fixture) image.

    Returns:
        None if both images decode to arrays of the same shape with equal
        values, otherwise a short error message describing the mismatch.
    """
    # Image.open is lazy and keeps the underlying file handle open; the
    # context managers close each file as soon as its pixel data has been
    # copied into an array.
    with Image.open(generated_path) as generated_image:
        generated_array = np.array(generated_image)
    with Image.open(expected_path) as expected_image:
        expected_array = np.array(expected_image)

    # Check shapes first so the value comparison below only ever sees arrays
    # of matching dimensions.
    if generated_array.shape != expected_array.shape:
        return f"Different dimensions: {generated_array.shape} vs {expected_array.shape}"

    if not np.array_equal(generated_array, expected_array):
        return "Pixel values differ"

    return None
353+
354+
331355
# ---------------------------------------------------------------------------
332356
# Parametrized test
333357
# ---------------------------------------------------------------------------
@@ -387,9 +411,6 @@ def test_chart_images_match_fixtures(tmp_path: Path):
387411
result = render_chart(wb, chart, output_path)
388412
assert result, f"{name} ({i}): render_chart returned False"
389413

390-
with open(output_path, "rb") as f:
391-
generated = f.read()
392-
with open(fixtures_dir / f"{i}.png", "rb") as f:
393-
expected = f.read()
394-
395-
assert generated == expected, f"{name} ({i}): Generated image differs from fixture"
414+
fixture_path = fixtures_dir / f"{i}.png"
415+
error = _compare_images(output_path, fixture_path)
416+
assert error is None, f"{name} ({i}): {error}"

tests/test_formatting.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -112,30 +112,36 @@ def test_all_formatting_types(tmp_path: Path):
112112
ws["B1"].font = Font(bold=True)
113113
ws["C1"] = "Value"
114114
ws["C1"].font = Font(bold=True)
115+
ws["D1"] = "Double"
116+
ws["D1"].font = Font(bold=True)
115117

116118
# Bold text
117119
ws["A2"] = "Alice"
118120
ws["A2"].font = Font(bold=True)
119121
ws["B2"] = "Active"
120122
ws["C2"] = 100
123+
ws["D2"] = "=SUM($A$1)"
121124

122125
# Italic text
123126
ws["A3"] = "Bob"
124127
ws["A3"].font = Font(italic=True)
125128
ws["B3"] = "Inactive"
126129
ws["C3"] = 50
130+
ws["D3"] = "=SUM($A$1)"
127131

128132
# Strikethrough text
129133
ws["A4"] = "Charlie"
130134
ws["A4"].font = Font(strikethrough=True)
131135
ws["B4"] = "Pending"
132136
ws["C4"] = 75
137+
ws["D4"] = "=SUM($A$1)"
133138

134139
# Bold + Italic
135140
ws["A5"] = "Diana"
136141
ws["A5"].font = Font(bold=True, italic=True)
137142
ws["B5"] = "Active"
138143
ws["C5"] = 200
144+
ws["D5"] = "=SUM($A$1)"
139145

140146
# Red text (annotation) with comment
141147
ws["A6"] = "Eve"
@@ -207,25 +213,27 @@ def test_all_formatting_types(tmp_path: Path):
207213
208214
## Table
209215
210-
| **Name** | **Status** (hidden) | **Value** |
211-
|:-------------------|:----------------------|:------------|
212-
| **Alice** | Active | 100 |
213-
| *Bob* | Inactive | 50 |
214-
| ~~Charlie~~ | Pending | 75 |
215-
| ***Diana*** | Active | 200 |
216-
| Eve | Error | 0 |
217-
| Frank | Warning | 25 |
218-
| <sub>H2O</sub> | Normal | 200 |
219-
| H2O | H<sub>2</sub>O | 100 |
220-
| Rotated 90° (↻90°) | Normal | 300 |
221-
| Grace | OK | 10 |
222-
| left | apex | right |
216+
| **Name** | **Status** (hidden) | **Value** | **Double** |
217+
|:-------------------|:----------------------|:------------|:-------------|
218+
| **Alice** | Active | 100 | |
219+
| *Bob* | Inactive | 50 | |
220+
| ~~Charlie~~ | Pending | 75 | |
221+
| ***Diana*** | Active | 200 | |
222+
| Eve | Error | 0 | |
223+
| Frank | Warning | 25 | |
224+
| <sub>H2O</sub> | Normal | 200 | |
225+
| H2O | H<sub>2</sub>O | 100 | |
226+
| Rotated 90° (↻90°) | Normal | 300 | |
227+
| Grace | OK | 10 | |
228+
| left | apex | right | |
223229
224230
### Annotations
225231
*(Cell references are relative to the table above)*
226232
227233
- B1:B10: bg_color=FFFFFF00
228234
235+
- D2:D5: formula==SUM($A$1)
236+
229237
- A6: fg_color=FFFF0000
230238
231239
- A12:C12: bg_color=FF0000FF

xldown/cells.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,12 @@ class CellAnnotation(BaseModel):
9292
fg_color: str | None = None
9393
bg_color: str | None = None
9494
border: str | None = None
95+
formula: str | None = None
9596
category: str | None = None
9697

9798
@classmethod
98-
def from_cell(cls, cell: Cell) -> "CellAnnotation":
99-
"""Extract formatting annotations (colors, borders) from a cell."""
99+
def from_cell(cls, cell: Cell, formula: str | None = None) -> "CellAnnotation":
100+
"""Extract formatting annotations (colors, borders, formula) from a cell."""
100101
font = cell.font or {}
101102
fill = cell.fill or {}
102103
border = cell.border or {}
@@ -121,6 +122,7 @@ def from_cell(cls, cell: Cell) -> "CellAnnotation":
121122
fg_color=fg_color,
122123
bg_color=bg_color,
123124
border=border_style,
125+
formula=formula,
124126
)
125127

126128

xldown/converter.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020

2121
def read_sheet(
22-
xlsx_path: Path, sheet_name: str, ws
22+
xlsx_path: Path, sheet_name: str, ws, formula_ws=None
2323
) -> list[tuple[RegionKind, pd.DataFrame | str, dict[tuple[int, int], CellAnnotation], dict[tuple[int, int], CellMetadata], set[str]]]:
2424
"""Read Excel sheet and decompose into tables and prose.
2525
@@ -30,6 +30,12 @@ def read_sheet(
3030
- PROSE: single isolated cell (returned as plain text)
3131
- TABLE: multi-cell region (first row = headers, returned as DataFrame)
3232
33+
Args:
34+
xlsx_path: Path to the Excel file (unused but kept for compatibility)
35+
sheet_name: Name of the sheet being read
36+
ws: Worksheet object with data_only=True
37+
formula_ws: Optional worksheet object with data_only=False for extracting formulas
38+
3339
Returns list of tuples: (kind, content, annotations, metadata, hidden_columns) where:
3440
- kind: RegionKind.PROSE or RegionKind.TABLE
3541
- content: str for prose, pd.DataFrame for table
@@ -39,11 +45,24 @@ def read_sheet(
3945
# Phase 1: Extract raw values and cell objects from worksheet.
4046
# Each row's list only contains cells up to the last non-empty cell in that row,
4147
# so rows may have different lengths and need padding later.
48+
# Collect formulas keyed by (row_idx, col_idx) if formula_ws is provided.
4249
data: list[list[str | int | float | bool | None]] = []
4350
cell_objects: list[list[Cell]] = []
44-
for row_idx, row in enumerate(ws.iter_rows(values_only=False), 1):
51+
cell_formulas: dict[tuple[int, int], str] = {}
52+
53+
if formula_ws:
54+
rows_iter = zip(ws.iter_rows(values_only=False), formula_ws.iter_rows(values_only=False))
55+
else:
56+
rows_iter = ((row, None) for row in ws.iter_rows(values_only=False))
57+
58+
for row_idx, (row, formula_row) in enumerate(rows_iter):
4559
data.append([cell.value for cell in row])
46-
cell_objects.append(list(row))
60+
cells = list(row)
61+
if formula_row:
62+
for col_idx, (cell, formula_cell) in enumerate(zip(cells, formula_row)):
63+
if formula_cell.value and isinstance(formula_cell.value, str) and formula_cell.value.startswith("="):
64+
cell_formulas[(row_idx, col_idx)] = formula_cell.value
65+
cell_objects.append(cells)
4766

4867
if not data:
4968
return []
@@ -106,8 +125,8 @@ def read_sheet(
106125
data_row = row_idx - region.min_row + 1
107126
data_col = col_idx - region.min_col + 1
108127

109-
annotation = CellAnnotation.from_cell(cell)
110-
if annotation.fg_color or annotation.bg_color or annotation.border:
128+
annotation = CellAnnotation.from_cell(cell, formula=cell_formulas.get((row_idx, col_idx)))
129+
if annotation.fg_color or annotation.bg_color or annotation.border or annotation.formula:
111130
annotations[(data_row, data_col)] = annotation
112131

113132
cell_metadata = CellMetadata.from_cell(cell)
@@ -166,14 +185,16 @@ def excel_to_markdown(
166185
images_dir.mkdir(parents=True, exist_ok=True)
167186

168187
wb = load_workbook(xlsx_path, data_only=True, rich_text=True)
188+
wb_formulas = load_workbook(xlsx_path, data_only=False, rich_text=True)
169189

170190
md_parts: list[str] = []
171191
chart_counter = 0
172192
img_counter = 0
173193

174194
for sheet_name in wb.sheetnames:
175195
ws = wb[sheet_name]
176-
regions = read_sheet(xlsx_path, sheet_name, ws)
196+
formula_ws = wb_formulas[sheet_name]
197+
regions = read_sheet(xlsx_path, sheet_name, ws, formula_ws=formula_ws)
177198

178199
md_parts.append(f"# {sheet_name}\n")
179200

@@ -205,6 +226,8 @@ def excel_to_markdown(
205226
parts.append(f"bg_color={annotation.bg_color}")
206227
if annotation.border:
207228
parts.append(f"border={annotation.border}")
229+
if annotation.formula:
230+
parts.append(f"formula={annotation.formula}")
208231
if parts:
209232
md_parts.append(f"- {range_str}: {' '.join(parts)}\n")
210233

0 commit comments

Comments
 (0)