1919
2020
2121def read_sheet (
22- xlsx_path : Path , sheet_name : str , ws
22+ xlsx_path : Path , sheet_name : str , ws , formula_ws = None
2323) -> list [tuple [RegionKind , pd .DataFrame | str , dict [tuple [int , int ], CellAnnotation ], dict [tuple [int , int ], CellMetadata ], set [str ]]]:
2424 """Read Excel sheet and decompose into tables and prose.
2525
@@ -30,6 +30,12 @@ def read_sheet(
3030 - PROSE: single isolated cell (returned as plain text)
3131 - TABLE: multi-cell region (first row = headers, returned as DataFrame)
3232
33+ Args:
34+ xlsx_path: Path to the Excel file (unused but kept for compatibility)
35+ sheet_name: Name of the sheet being read
36+ ws: Worksheet from a workbook loaded with data_only=True (cells hold cached values)
37+ formula_ws: Optional worksheet from a workbook loaded with data_only=False, used to extract formulas
38+
3339 Returns list of tuples: (kind, content, annotations, metadata, hidden_columns) where:
3440 - kind: RegionKind.PROSE or RegionKind.TABLE
3541 - content: str for prose, pd.DataFrame for table
@@ -39,11 +45,24 @@ def read_sheet(
3945 # Phase 1: Extract raw values and cell objects from worksheet.
4046 # Each row's list only contains cells up to the last non-empty cell in that row,
4147 # so rows may have different lengths and need padding later.
48+ # Collect formulas keyed by zero-based (row_idx, col_idx) if formula_ws is provided.
4249 data : list [list [str | int | float | bool | None ]] = []
4350 cell_objects : list [list [Cell ]] = []
44- for row_idx , row in enumerate (ws .iter_rows (values_only = False ), 1 ):
51+ cell_formulas : dict [tuple [int , int ], str ] = {}
52+
53+ if formula_ws :
54+ rows_iter = zip (ws .iter_rows (values_only = False ), formula_ws .iter_rows (values_only = False ))
55+ else :
56+ rows_iter = ((row , None ) for row in ws .iter_rows (values_only = False ))
57+
58+ for row_idx , (row , formula_row ) in enumerate (rows_iter ):
4559 data .append ([cell .value for cell in row ])
46- cell_objects .append (list (row ))
60+ cells = list (row )
61+ if formula_row :
62+ for col_idx , (cell , formula_cell ) in enumerate (zip (cells , formula_row )):
63+ if formula_cell .value and isinstance (formula_cell .value , str ) and formula_cell .value .startswith ("=" ):
64+ cell_formulas [(row_idx , col_idx )] = formula_cell .value
65+ cell_objects .append (cells )
4766
4867 if not data :
4968 return []
@@ -106,8 +125,8 @@ def read_sheet(
106125 data_row = row_idx - region .min_row + 1
107126 data_col = col_idx - region .min_col + 1
108127
109- annotation = CellAnnotation .from_cell (cell )
110- if annotation .fg_color or annotation .bg_color or annotation .border :
128+ annotation = CellAnnotation .from_cell (cell , formula = cell_formulas . get (( row_idx , col_idx )) )
129+ if annotation .fg_color or annotation .bg_color or annotation .border or annotation . formula :
111130 annotations [(data_row , data_col )] = annotation
112131
113132 cell_metadata = CellMetadata .from_cell (cell )
@@ -166,14 +185,16 @@ def excel_to_markdown(
166185 images_dir .mkdir (parents = True , exist_ok = True )
167186
168187 wb = load_workbook (xlsx_path , data_only = True , rich_text = True )
188+ wb_formulas = load_workbook (xlsx_path , data_only = False , rich_text = True )
169189
170190 md_parts : list [str ] = []
171191 chart_counter = 0
172192 img_counter = 0
173193
174194 for sheet_name in wb .sheetnames :
175195 ws = wb [sheet_name ]
176- regions = read_sheet (xlsx_path , sheet_name , ws )
196+ formula_ws = wb_formulas [sheet_name ]
197+ regions = read_sheet (xlsx_path , sheet_name , ws , formula_ws = formula_ws )
177198
178199 md_parts .append (f"# { sheet_name } \n " )
179200
@@ -205,6 +226,8 @@ def excel_to_markdown(
205226 parts .append (f"bg_color={ annotation .bg_color } " )
206227 if annotation .border :
207228 parts .append (f"border={ annotation .border } " )
229+ if annotation .formula :
230+ parts .append (f"formula={ annotation .formula } " )
208231 if parts :
209232 md_parts .append (f"- { range_str } : { ' ' .join (parts )} \n " )
210233
0 commit comments