Skip to content

Commit 00312e0

Browse files
author
Ben King
committed
Add SegmentBoundaryAdjuster + tests, and use it to adjust the placement of paragraph markers.
1 parent aa71f15 commit 00312e0

4 files changed

Lines changed: 651 additions & 0 deletions

File tree

machine/corpora/place_markers_usfm_update_block_handler.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import List, TypedDict, cast
44

55
from ..translation.word_alignment_matrix import WordAlignmentMatrix
6+
from .segment_boundary_adjuster import SegmentBoundaryAdjuster
67
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior
78
from .usfm_token import UsfmToken, UsfmTokenType
89
from .usfm_update_block import UsfmUpdateBlock
@@ -137,6 +138,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
137138
for element, adj_src_tok in zip(to_place, adj_src_toks):
138139
adj_trg_tok = self._predict_marker_location(alignment_info["alignment"], adj_src_tok, src_toks, trg_toks)
139140

141+
# If inserting a paragraph marker, make small adjustments to place it in a more natural location
142+
if element.type == UsfmUpdateBlockElementType.PARAGRAPH:
143+
adj_trg_tok = SegmentBoundaryAdjuster().adjust_tokenized_segment_pair_boundaries(adj_trg_tok, trg_toks)
144+
140145
if (
141146
adj_trg_tok > 0
142147
and element.type == UsfmUpdateBlockElementType.STYLE
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
from typing import List, Set, Tuple
2+
3+
import regex
4+
5+
6+
# This class is used by SegmentBoundaryAdjuster when it is dealing with tokenized text.
7+
class TokenRejoiner:
    """Rebuild readable text from a sequence of tokens, one token at a time.

    This class is used by SegmentBoundaryAdjuster when it is dealing with tokenized text.
    A single space is inserted between consecutive tokens, except when the incoming token is
    one of the closing/sentence punctuation marks in ``_NO_LEADING_SPACE_CHARACTERS`` or the
    text built so far ends with one of the opening marks in ``_NO_TRAILING_SPACE_CHARACTERS``.
    """

    # Characters after which no separating space is inserted.
    _NO_TRAILING_SPACE_CHARACTERS: Set[str] = {"(", "[", "{", "«", "‹", "“", "‘"}
    # Tokens before which no separating space is inserted.
    _NO_LEADING_SPACE_CHARACTERS: Set[str] = {",", ";", ":", ".", "!", "?", ")", "]", "}", "”", "’", "»", "›"}

    def __init__(self) -> None:
        self._joined_text = ""
        self._num_tokens = 0

    @classmethod
    def join_tokens(cls, tokens: List[str]) -> str:
        """Join ``tokens`` into one string using the spacing rules of this class.

        A trailing space is appended unless the result is empty or ends with a character
        from ``_NO_TRAILING_SPACE_CHARACTERS``.
        """
        rejoiner = cls()
        for token in tokens:
            rejoiner.add_token_to_joined_text(token)
        joined_text = rejoiner._joined_text
        if joined_text and joined_text[-1] not in cls._NO_TRAILING_SPACE_CHARACTERS:
            joined_text += " "
        return joined_text

    def add_token_to_joined_text(self, token: str) -> str:
        """Append ``token`` to the accumulated text and return the text built so far."""
        self._num_tokens += 1
        if not token:
            # An empty token contributes neither text nor a separator.
            return self._joined_text
        # Guard on the text itself rather than on the token count, so that indexing the
        # last character is always safe (e.g. when the first token was an empty string).
        if (
            self._joined_text
            and token not in self._NO_LEADING_SPACE_CHARACTERS
            and self._joined_text[-1] not in self._NO_TRAILING_SPACE_CHARACTERS
        ):
            self._joined_text += " "
        self._joined_text += token
        return self._joined_text
35+
36+
37+
class SegmentBoundaryAdjuster:
    """Make small adjustments to the boundary between two adjacent text segments.

    The adjustments move characters or a few words across the boundary so that a segment
    does not start with closing punctuation or whitespace, does not end with an opening
    bracket or quote, and so that a sentence start or sentence end that sits within a few
    words of the boundary ends up on the matching side of it.
    """

    # Characters that are moved to the end of the previous segment when they start a segment.
    _PROHIBITED_VERSE_STARTING_CHARACTERS: Set[str] = {
        " ",
        ",",
        ";",
        ":",
        ".",
        "!",
        "?",
        ")",
        "]",
        "}",
        "”",
        "’",
    }
    # Characters that are moved to the start of the next segment when they end a segment.
    _PROHIBITED_VERSE_ENDING_CHARACTERS: Set[str] = {"(", "[", "{", "«", "‹", "“", "‘"}
    # Group 1: a punctuation character plus following whitespace. Group 2: an uppercase-initial
    # word and up to two further words, anchored at the end of the segment.
    _PUNCTUATION_AND_SENTENCE_STARTING_PATTERN = regex.compile(r".*([^\w\s]\s*)(\p{Lu}\w+(\s+\w+)?(\s+\w+)?\s*)$")
    # Group 1: a lowercase-initial word and up to two further words at the start of the segment.
    # Group 4: the punctuation (plus closing quotes/whitespace) that follows them.
    _WORDS_AND_SENTENCE_ENDING_PATTERN = regex.compile(r"^(\p{Ll}\w+(\s+\w+)?(\s+\w+)?)([\.,;:!\?\)\]”’]\s*[”’]*\s*)")

    def adjust_segment_boundaries(self, verses: List[str]) -> List[str]:
        """Adjust the boundary between every pair of consecutive segments.

        The list is updated in place, left to right, and the same list object is returned.
        """
        for i in range(len(verses) - 1):
            verses[i], verses[i + 1] = self.adjust_segment_pair_boundary(verses[i], verses[i + 1])
        return verses

    def adjust_segment_pair_boundary(self, segment: str, next_segment: str) -> Tuple[str, str]:
        """Return ``(segment, next_segment)`` with the boundary between them adjusted."""
        # Leading closing punctuation/whitespace belongs to the previous segment.
        while len(next_segment) > 0 and next_segment[0] in self._PROHIBITED_VERSE_STARTING_CHARACTERS:
            segment += next_segment[0]
            next_segment = next_segment[1:]
        # Trailing opening brackets/quotes belong to the next segment.
        while len(segment) > 0 and segment[-1] in self._PROHIBITED_VERSE_ENDING_CHARACTERS:
            next_segment = segment[-1] + next_segment
            segment = segment[:-1]
        if self._segment_ends_with_start_of_sentence(segment):
            segment, next_segment = self._adjust_for_missed_sentence_start(segment, next_segment)
        if self._segment_starts_with_end_of_sentence(next_segment):
            segment, next_segment = self._adjust_for_late_sentence_end(segment, next_segment)
        return segment, next_segment

    def _segment_ends_with_start_of_sentence(self, segment: str) -> bool:
        """Return True if ``segment`` matches the sentence-starting pattern."""
        return self._PUNCTUATION_AND_SENTENCE_STARTING_PATTERN.match(segment) is not None

    def _adjust_for_missed_sentence_start(self, segment: str, next_segment: str) -> Tuple[str, str]:
        """Move the sentence-starting words at the end of ``segment`` to the front of ``next_segment``."""
        match = self._PUNCTUATION_AND_SENTENCE_STARTING_PATTERN.match(segment)
        if match is not None:
            capitalized_word = match.group(2)
            # Keep everything up to and including the punctuation (and its trailing whitespace).
            segment = segment[: match.end(1)]
            next_segment = capitalized_word + ("" if capitalized_word[-1] == " " else " ") + next_segment
        return segment, next_segment

    def _segment_starts_with_end_of_sentence(self, segment: str) -> bool:
        """Return True if ``segment`` matches the sentence-ending pattern."""
        return self._WORDS_AND_SENTENCE_ENDING_PATTERN.match(segment) is not None

    def _adjust_for_late_sentence_end(self, segment: str, next_segment: str) -> Tuple[str, str]:
        """Move the sentence-ending words and punctuation at the start of ``next_segment`` onto ``segment``."""
        match = self._WORDS_AND_SENTENCE_ENDING_PATTERN.match(next_segment)
        if match is not None:
            words = match.group(1)
            punctuation = match.group(4)
            segment = segment + words + punctuation
            next_segment = next_segment[match.end(0) :]
        return segment, next_segment

    def adjust_tokenized_segment_pair_boundaries(self, segment_boundary: int, tokens: List[str]) -> int:
        """Adjust a boundary expressed as a token index and return the adjusted token index.

        ``tokens[:segment_boundary]`` and ``tokens[segment_boundary:]`` are rejoined into text,
        the text boundary is adjusted, and the result is mapped back to a token index.
        """
        segment_text = TokenRejoiner.join_tokens(tokens[:segment_boundary])
        next_segment_text = TokenRejoiner.join_tokens(tokens[segment_boundary:])
        adjusted_segment_text = self.adjust_segment_pair_boundary(segment_text, next_segment_text)[0].strip()

        return self._find_best_boundary_from_segment_length(tokens, len(adjusted_segment_text))

    def _find_best_boundary_from_segment_length(self, tokens: List[str], target_segment_length: int) -> int:
        """Return the token index whose rejoined-text length is closest to ``target_segment_length``.

        Returns ``len(tokens)`` if the target is longer than all tokens rejoined.
        """
        token_rejoiner = TokenRejoiner()
        # Length of the rejoined text before the current token was added.
        previous_length = 0

        for index, token in enumerate(tokens):
            accumulated_length = len(token_rejoiner.add_token_to_joined_text(token))

            if accumulated_length >= target_segment_length:
                # In the unlikely case that the adjusted boundary falls in the middle of a token
                # select the token boundary that is closest. The previous boundary is measured
                # from the text length before this token, so that the separator space inserted
                # ahead of the token is not counted as part of the previous segment.
                error_with_current_boundary = accumulated_length - target_segment_length
                error_with_previous_boundary = target_segment_length - previous_length

                if error_with_current_boundary < error_with_previous_boundary:
                    return index + 1
                else:
                    return index

            previous_length = accumulated_length

        return len(tokens)

tests/corpora/test_place_markers_usfm_update_block_handler.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,43 @@ def test_support_verse_zero():
692692
assert_usfm_equals(target, result)
693693

694694

695+
def test_adjustment_of_placed_paragraph_marker() -> None:
    """Check where paragraph markers land when they are placed into a pretranslation.

    The expected output keeps the comma after "inglés" at the end of the second paragraph
    and starts the third paragraph with "y".
    """
    source_text = "This is the first paragraph. This text is in English and this test is for paragraph markers."
    translated_text = (
        "Este es el primer párrafo. Este texto está en inglés, y esta prueba es para marcadores de párrafo."
    )
    # Source-to-translation token alignment pairs ("source_index-translation_index").
    alignment_pairs = (
        "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 11-11 11-12 12-13 13-14 14-15 15-16 16-19 17-17 18-20"
    )
    alignment_info = PlaceMarkersAlignmentInfo(
        source_tokens=list(TOKENIZER.tokenize(source_text)),
        translation_tokens=list(TOKENIZER.tokenize(translated_text)),
        alignment=to_word_alignment_matrix(alignment_pairs),
        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
        style_behavior=UpdateUsfmMarkerBehavior.STRIP,
    )
    update_rows = [
        UpdateUsfmRow(scr_ref("MAT 1:1"), str(translated_text), {"alignment_info": alignment_info}),
    ]
    source_usfm = r"""\id MAT
\c 1
\v 1 This is the first paragraph.
\p This text is in English
\p and this test is for paragraph markers.
"""
    expected_usfm = r"""\id MAT
\c 1
\v 1 Este es el primer párrafo.
\p Este texto está en inglés,
\p y esta prueba es para marcadores de párrafo.
"""

    actual_usfm = update_usfm(
        update_rows,
        source_usfm,
        paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
        update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()],
    )

    assert_usfm_equals(actual_usfm, expected_usfm)
730+
731+
695732
def scr_ref(*refs: str) -> List[ScriptureRef]:
    """Parse each reference string with ``ScriptureRef.parse`` and return the results in order."""
    return list(map(ScriptureRef.parse, refs))
697734

0 commit comments

Comments
 (0)