From 34a7dc4f0207203d311a7aaa02d4bfefc7e63af7 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee <sjrl423@gmail.com>
Date: Fri, 7 Feb 2025 15:05:30 +0100
Subject: [PATCH] Docstring updates

---
 .../preprocessors/csv_document_splitter.py    | 26 +++++++------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/haystack/components/preprocessors/csv_document_splitter.py b/haystack/components/preprocessors/csv_document_splitter.py
index 90d4fd9519..fd687dbabb 100644
--- a/haystack/components/preprocessors/csv_document_splitter.py
+++ b/haystack/components/preprocessors/csv_document_splitter.py
@@ -17,7 +17,10 @@
 @component
 class CSVDocumentSplitter:
     """
-    A component for splitting CSV documents
+    A component for splitting CSV documents into sub-tables based on empty rows and columns.
+
+    The splitter identifies runs of consecutive empty rows or columns whose length meets or exceeds
+    a given threshold and uses them as delimiters to segment the document into smaller tables.
     """
 
     def __init__(self, row_split_threshold: Optional[int] = 2, column_split_threshold: Optional[int] = 2) -> None:
@@ -26,12 +29,8 @@ def __init__(self, row_split_threshold: Optional[int] = 2, column_split_threshol
 
         :param row_split_threshold:
             The minimum number of consecutive empty rows required to trigger a split.
-            A higher threshold prevents excessive splitting, while a lower threshold may lead
-            to more fragmented sub-tables.
         :param column_split_threshold:
             The minimum number of consecutive empty columns required to trigger a split.
-            A higher threshold prevents excessive splitting, while a lower threshold may lead
-            to more fragmented sub-tables.
         """
         pandas_import.check()
         if row_split_threshold is not None and row_split_threshold < 1:
@@ -52,10 +51,11 @@ def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
         Processes and splits a list of CSV documents into multiple sub-tables.
 
         **Splitting Process:**
-        1. Row Splitting: Detects empty rows and separates tables stacked vertically.
-        2. Column Splitting: Detects empty columns and separates side-by-side tables.
-        3. Recursive Row Check: After splitting by columns, it checks for new row splits
-           introduced by the column split.
+        1. Applies a row-based split if `row_split_threshold` is provided.
+        2. Applies a column-based split if `column_split_threshold` is provided.
+        3. If both thresholds are specified, splits recursively, first by rows and then by columns, so that
+           any sub-table that still contains empty sections is split further.
+        4. Sorts the resulting sub-tables based on their original positions within the document.
 
         :param documents: A list of Documents containing CSV-formatted content.
             Each document is assumed to contain one or more tables separated by empty rows or columns.
@@ -206,14 +206,6 @@ def _recursive_split(
         :param df: A Pandas DataFrame representing a table (or multiple tables) extracted from a CSV.
         :param row_split_threshold: The minimum number of consecutive empty rows required to trigger a split.
         :param column_split_threshold: The minimum number of consecutive empty columns to trigger a split.
-
-        **Splitting Process:**
-        1. Row Splitting: Detects empty rows and separates tables stacked vertically.
-        2. Column Splitting: Detects empty columns and separates side-by-side tables.
-        3. Recursive Row Check: After splitting by columns, it checks for new row splits
-           introduced by the column split.
-
-        Termination Condition: If no further splits are detected, the recursion stops.
         """
 
         # Step 1: Split by rows