Add SegmentBoundaryAdjuster + tests, and use it to adjust the placement of paragraph markers.

Ben King · Ben King · commit 00312e025d0c · 2026-05-13T17:05:55.000-04:00
diff --git a/machine/corpora/place_markers_usfm_update_block_handler.py b/machine/corpora/place_markers_usfm_update_block_handler.py
@@ -3,6 +3,7 @@
 from typing import List, TypedDict, cast
 
 from ..translation.word_alignment_matrix import WordAlignmentMatrix
+from .segment_boundary_adjuster import SegmentBoundaryAdjuster
 from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior
 from .usfm_token import UsfmToken, UsfmTokenType
 from .usfm_update_block import UsfmUpdateBlock
@@ -137,6 +138,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
         for element, adj_src_tok in zip(to_place, adj_src_toks):
             adj_trg_tok = self._predict_marker_location(alignment_info["alignment"], adj_src_tok, src_toks, trg_toks)
 
+            # If inserting a paragraph marker, make small adjustments to place it in a more natural location
+            if element.type == UsfmUpdateBlockElementType.PARAGRAPH:
+                adj_trg_tok = SegmentBoundaryAdjuster().adjust_tokenized_segment_pair_boundaries(adj_trg_tok, trg_toks)
+
             if (
                 adj_trg_tok > 0
                 and element.type == UsfmUpdateBlockElementType.STYLE
diff --git a/machine/corpora/segment_boundary_adjuster.py b/machine/corpora/segment_boundary_adjuster.py
@@ -0,0 +1,160 @@
+from typing import List, Set, Tuple
+
+import regex
+
+
+class TokenRejoiner:
+    """Rebuilds running text from a sequence of tokens.
+
+    This class is used by SegmentBoundaryAdjuster when it is dealing with tokenized text. A single space is
+    placed before each added token, except when the token is one of the no-leading-space characters, when
+    the text built so far ends with one of the no-trailing-space characters, or when no text has been built.
+    """
+
+    _NO_TRAILING_SPACE_CHARACTERS: Set[str] = {"(", "[", "{", "«", "‹", "“", "‘"}
+    _NO_LEADING_SPACE_CHARACTERS: Set[str] = {",", ";", ":", ".", "!", "?", ")", "]", "}", "”", "’", "»", "›"}
+
+    def __init__(self) -> None:
+        self._joined_text = ""
+        self._num_tokens = 0
+
+    @classmethod
+    def join_tokens(cls, tokens: List[str]) -> str:
+        """Join all of ``tokens`` into one string.
+
+        A trailing space is appended unless the result is empty or ends with a no-trailing-space character.
+        """
+        rejoiner = cls()
+        for token in tokens:
+            rejoiner.add_token_to_joined_text(token)
+        if len(rejoiner._joined_text) > 0 and rejoiner._joined_text[-1] not in cls._NO_TRAILING_SPACE_CHARACTERS:
+            rejoiner._joined_text += " "
+        return rejoiner._joined_text
+
+    def add_token_to_joined_text(self, token: str) -> str:
+        """Append ``token`` to the accumulated text and return all of the text built so far."""
+        # Check the accumulated text rather than the token count: an empty first token leaves the text empty,
+        # and reading its last character would raise an IndexError.
+        if (
+            len(self._joined_text) > 0
+            and token not in self._NO_LEADING_SPACE_CHARACTERS
+            and self._joined_text[-1] not in self._NO_TRAILING_SPACE_CHARACTERS
+        ):
+            self._joined_text += " "
+        self._joined_text += token
+        self._num_tokens += 1
+        return self._joined_text
+
+
+class SegmentBoundaryAdjuster:
+    """Moves the boundary between adjacent text segments to a more natural position.
+
+    Characters that may not start a segment are moved to the end of the preceding segment, characters that
+    may not end a segment are moved to the start of the following one, and up to three words are moved across
+    the boundary when a sentence appears to start just before it or to end just after it.
+    """
+
+    _PROHIBITED_VERSE_STARTING_CHARACTERS: Set[str] = {
+        " ",
+        ",",
+        ";",
+        ":",
+        ".",
+        "!",
+        "?",
+        ")",
+        "]",
+        "}",
+        "”",
+        "’",
+    }
+    _PROHIBITED_VERSE_ENDING_CHARACTERS: Set[str] = {"(", "[", "{", "«", "‹", "“", "‘"}
+    # Punctuation followed by a capitalized word and at most two more words, at the very end of the text.
+    _PUNCTUATION_AND_SENTENCE_STARTING_PATTERN = regex.compile(r".*([^\w\s]\s*)(\p{Lu}\w+(\s+\w+)?(\s+\w+)?\s*)$")
+    # A lowercase-initial word plus at most two more words, then closing punctuation, at the start of the text.
+    _WORDS_AND_SENTENCE_ENDING_PATTERN = regex.compile(r"^(\p{Ll}\w+(\s+\w+)?(\s+\w+)?)([\.,;:!\?\)\]”’]\s*[”’]*\s*)")
+
+    def adjust_segment_boundaries(self, verses: List[str]) -> List[str]:
+        """Adjust the boundary between each pair of consecutive segments in ``verses``.
+
+        The list is updated in place and also returned.
+        """
+        for i in range(len(verses) - 1):
+            verses[i], verses[i + 1] = self.adjust_segment_pair_boundary(verses[i], verses[i + 1])
+        return verses
+
+    def adjust_segment_pair_boundary(self, segment: str, next_segment: str) -> Tuple[str, str]:
+        """Return ``segment`` and ``next_segment`` with the boundary between them adjusted."""
+        # Leading characters that may not start a segment are moved to the end of the previous one.
+        while len(next_segment) > 0 and next_segment[0] in self._PROHIBITED_VERSE_STARTING_CHARACTERS:
+            segment += next_segment[0]
+            next_segment = next_segment[1:]
+        # Trailing characters that may not end a segment are moved to the start of the next one.
+        while len(segment) > 0 and segment[-1] in self._PROHIBITED_VERSE_ENDING_CHARACTERS:
+            next_segment = segment[-1] + next_segment
+            segment = segment[:-1]
+        if self._segment_ends_with_start_of_sentence(segment):
+            segment, next_segment = self._adjust_for_missed_sentence_start(segment, next_segment)
+        if self._segment_starts_with_end_of_sentence(next_segment):
+            segment, next_segment = self._adjust_for_late_sentence_end(segment, next_segment)
+        return segment, next_segment
+
+    def _segment_ends_with_start_of_sentence(self, segment: str) -> bool:
+        """Return whether ``segment`` ends with punctuation followed by the first words of a sentence."""
+        return self._PUNCTUATION_AND_SENTENCE_STARTING_PATTERN.match(segment) is not None
+
+    def _adjust_for_missed_sentence_start(self, segment: str, next_segment: str) -> Tuple[str, str]:
+        """Move the sentence-starting words at the end of ``segment`` to the start of ``next_segment``."""
+        match = self._PUNCTUATION_AND_SENTENCE_STARTING_PATTERN.match(segment)
+        if match is not None:
+            capitalized_word = match.group(2)
+            # Keep everything up to and including the punctuation and any whitespace that follows it.
+            segment = segment[: match.end(1)]
+            next_segment = capitalized_word + ("" if capitalized_word[-1] == " " else " ") + next_segment
+        return segment, next_segment
+
+    def _segment_starts_with_end_of_sentence(self, segment: str) -> bool:
+        """Return whether ``segment`` starts with the last words of a sentence and closing punctuation."""
+        return self._WORDS_AND_SENTENCE_ENDING_PATTERN.match(segment) is not None
+
+    def _adjust_for_late_sentence_end(self, segment: str, next_segment: str) -> Tuple[str, str]:
+        """Move the sentence-ending words at the start of ``next_segment`` to the end of ``segment``."""
+        match = self._WORDS_AND_SENTENCE_ENDING_PATTERN.match(next_segment)
+        if match is not None:
+            words = match.group(1)
+            punctuation = match.group(4)
+            segment = segment + words + punctuation
+            next_segment = next_segment[match.end(0) :]
+        return segment, next_segment
+
+    def adjust_tokenized_segment_pair_boundaries(self, segment_boundary: int, tokens: List[str]) -> int:
+        """Adjust a segment boundary expressed as an index into ``tokens``.
+
+        ``tokens[:segment_boundary]`` and ``tokens[segment_boundary:]`` are rejoined into text, the text
+        boundary is adjusted, and the token index that best matches the adjusted boundary is returned.
+        """
+        segment_text = TokenRejoiner.join_tokens(tokens[:segment_boundary])
+        next_segment_text = TokenRejoiner.join_tokens(tokens[segment_boundary:])
+        adjusted_segment_text = self.adjust_segment_pair_boundary(segment_text, next_segment_text)[0].strip()
+
+        return self._find_best_boundary_from_segment_length(tokens, len(adjusted_segment_text))
+
+    def _find_best_boundary_from_segment_length(self, tokens: List[str], target_segment_length: int) -> int:
+        """Return the token boundary whose rejoined prefix length is closest to ``target_segment_length``."""
+        token_rejoiner = TokenRejoiner()
+
+        for index, token in enumerate(tokens):
+            accumulated_length = len(token_rejoiner.add_token_to_joined_text(token))
+
+            if accumulated_length >= target_segment_length:
+                # In the unlikely case that the adjusted boundary falls in the middle of a token
+                # select the token boundary that is closest
+                error_with_current_boundary = accumulated_length - target_segment_length
+                error_with_previous_boundary = target_segment_length - (accumulated_length - len(token))
+
+                if error_with_current_boundary < error_with_previous_boundary:
+                    return index + 1
+                else:
+                    return index
+
+        return len(tokens)
diff --git a/tests/corpora/test_place_markers_usfm_update_block_handler.py b/tests/corpora/test_place_markers_usfm_update_block_handler.py
@@ -692,6 +692,46 @@ def test_support_verse_zero():
     assert_usfm_equals(target, result)
 
 
+def test_adjustment_of_placed_paragraph_marker() -> None:
+    """Place two paragraph markers and expect the comma to stay with the preceding paragraph."""
+    source_text = "This is the first paragraph. This text is in English and this test is for paragraph markers."
+    translated_text = (
+        "Este es el primer párrafo. Este texto está en inglés, y esta prueba es para marcadores de párrafo."
+    )
+    alignment_pairs = (
+        "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 11-11 11-12 12-13 13-14 14-15 15-16 16-19 17-17 18-20"
+    )
+    align_info = PlaceMarkersAlignmentInfo(
+        source_tokens=list(TOKENIZER.tokenize(source_text)),
+        translation_tokens=list(TOKENIZER.tokenize(translated_text)),
+        alignment=to_word_alignment_matrix(alignment_pairs),
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        style_behavior=UpdateUsfmMarkerBehavior.STRIP,
+    )
+    rows = [UpdateUsfmRow(scr_ref("MAT 1:1"), str(translated_text), {"alignment_info": align_info})]
+    source_usfm = r"""\id MAT
+\c 1
+\v 1 This is the first paragraph.
+\p This text is in English
+\p and this test is for paragraph markers.
+"""
+    expected_usfm = r"""\id MAT
+\c 1
+\v 1 Este es el primer párrafo.
+\p Este texto está en inglés,
+\p y esta prueba es para marcadores de párrafo.
+"""
+
+    actual_usfm = update_usfm(
+        rows,
+        source_usfm,
+        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
+        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()],
+    )
+    assert_usfm_equals(actual_usfm, expected_usfm)
+
+
 def scr_ref(*refs: str) -> List[ScriptureRef]:
     return [ScriptureRef.parse(ref) for ref in refs]
 
diff --git a/tests/corpora/test_segment_boundary_adjuster.py b/tests/corpora/test_segment_boundary_adjuster.py