Skip to content

Commit bd90063

Browse files
committed
Add marker behavior to metadata for marker placement, fix bug related to headers when stripping paragraphs
1 parent: 531934a; commit: bd90063

File tree

2 files changed: 95 additions (+95) and 4 deletions (-4)

machine/corpora/place_markers_usfm_update_block_handler.py

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
from typing import List, TypedDict, cast
44

55
from ..translation.word_alignment_matrix import WordAlignmentMatrix
6+
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior
67
from .usfm_token import UsfmToken, UsfmTokenType
78
from .usfm_update_block import UsfmUpdateBlock
89
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
@@ -15,6 +16,8 @@ class PlaceMarkersAlignmentInfo(TypedDict):
1516
source_tokens: List[str]
1617
translation_tokens: List[str]
1718
alignment: WordAlignmentMatrix
19+
paragraph_behavior: UpdateUsfmMarkerBehavior
20+
style_behavior: UpdateUsfmMarkerBehavior
1821

1922

2023
class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
@@ -33,9 +36,15 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
3336
or alignment_info["alignment"].column_count == 0
3437
or not any(
3538
(
36-
e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
37-
and not e.marked_for_removal
38-
and len(e.tokens) == 1
39+
(
40+
e.type == UsfmUpdateBlockElementType.PARAGRAPH
41+
and alignment_info["paragraph_behavior"] == UpdateUsfmMarkerBehavior.PRESERVE
42+
and len(e.tokens) == 1
43+
)
44+
or (
45+
e.type == UsfmUpdateBlockElementType.STYLE
46+
and alignment_info["style_behavior"] == UpdateUsfmMarkerBehavior.PRESERVE
47+
)
3948
)
4049
for e in elements
4150
)
@@ -92,7 +101,10 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
92101
else:
93102
trg_sent += element.tokens[0].to_usfm()
94103

95-
if element.marked_for_removal:
104+
if element.marked_for_removal or (
105+
element.type == UsfmUpdateBlockElementType.PARAGRAPH
106+
and alignment_info["paragraph_behavior"] == UpdateUsfmMarkerBehavior.STRIP
107+
):
96108
ignored_elements.append(element)
97109
elif element.type == UsfmUpdateBlockElementType.EMBED:
98110
embed_elements.append(element)

tests/corpora/test_place_markers_usfm_update_block_handler.py

Lines changed: 79 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,8 @@ def test_paragraph_markers() -> None:
2727
alignment=to_word_alignment_matrix(
2828
"0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19"
2929
),
30+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
31+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
3032
)
3133
rows = [UpdateUsfmRow(scr_ref("MAT 1:1"), str(pretranslation), {"alignment_info": align_info})]
3234
usfm = r"""\id MAT
@@ -60,6 +62,8 @@ def test_style_markers() -> None:
6062
alignment=to_word_alignment_matrix(
6163
"0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19"
6264
),
65+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
66+
style_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
6367
)
6468
rows = [UpdateUsfmRow(scr_ref("MAT 1:1"), str(pretranslation), metadata={"alignment_info": align_info})]
6569
usfm = r"""\id MAT
@@ -79,6 +83,16 @@ def test_style_markers() -> None:
7983
"""
8084
assess(target, result)
8185

86+
align_info = PlaceMarkersAlignmentInfo(
87+
source_tokens=[t for t in TOKENIZER.tokenize(source)],
88+
translation_tokens=[t for t in TOKENIZER.tokenize(pretranslation)],
89+
alignment=to_word_alignment_matrix(
90+
"0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 12-11 13-12 14-13 15-14 16-15 17-18 18-16 19-19"
91+
),
92+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
93+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
94+
)
95+
rows = [UpdateUsfmRow(scr_ref("MAT 1:1"), str(pretranslation), metadata={"alignment_info": align_info})]
8296
target = update_usfm(
8397
rows,
8498
usfm,
@@ -159,6 +173,8 @@ def test_trailing_empty_paragraphs() -> None:
159173
source_tokens=["Verse", "1"],
160174
translation_tokens=["New", "verse", "1"],
161175
alignment=to_word_alignment_matrix("0-1 1-2"),
176+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
177+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
162178
)
163179
},
164180
)
@@ -197,6 +213,8 @@ def test_headers() -> None:
197213
source_tokens=["A", "B", "C"],
198214
translation_tokens=["X", "Y", "Z"],
199215
alignment=to_word_alignment_matrix("0-0 1-1 2-2"),
216+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
217+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
200218
)
201219
},
202220
),
@@ -208,6 +226,8 @@ def test_headers() -> None:
208226
source_tokens=["A"],
209227
translation_tokens=["X"],
210228
alignment=to_word_alignment_matrix("0-0"),
229+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
230+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
211231
)
212232
},
213233
),
@@ -276,6 +296,8 @@ def test_consecutive_markers() -> None:
276296
source_tokens=["Old", "verse", "1", "word"],
277297
translation_tokens=["New", "verse", "1", "WORD"],
278298
alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3"),
299+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
300+
style_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
279301
)
280302
},
281303
)
@@ -311,6 +333,8 @@ def test_verse_ranges() -> None:
311333
source_tokens=["Verse", "range", "old", "paragraph", "2"],
312334
translation_tokens=["New", "verse", "range", "text", "new", "paragraph", "2"],
313335
alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5 4-6"),
336+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
337+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
314338
)
315339
},
316340
)
@@ -346,6 +370,8 @@ def test_no_update() -> None:
346370
source_tokens=["Old", "paragraph", "1", "Old", "paragraph", "2"],
347371
translation_tokens=["New", "paragraph", "1", "New", "paragraph", "2"],
348372
alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"),
373+
paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP,
374+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
349375
)
350376
},
351377
)
@@ -378,6 +404,8 @@ def test_no_update() -> None:
378404
source_tokens=[],
379405
translation_tokens=[],
380406
alignment=to_word_alignment_matrix(""),
407+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
408+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
381409
)
382410
},
383411
)
@@ -422,6 +450,8 @@ def test_split_tokens() -> None:
422450
source_tokens=["words", "split", "words", "split", "words", "split"],
423451
translation_tokens=["words", "split", "words", "split", "words", "split"],
424452
alignment=to_word_alignment_matrix("0-0 1-1 2-2 3-3 4-4 5-5"),
453+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
454+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
425455
)
426456
},
427457
)
@@ -458,6 +488,8 @@ def test_no_text() -> None:
458488
source_tokens=[],
459489
translation_tokens=[],
460490
alignment=to_word_alignment_matrix(""),
491+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
492+
style_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
461493
)
462494
},
463495
)
@@ -491,6 +523,8 @@ def test_consecutive_substring() -> None:
491523
source_tokens=["string", "ring"],
492524
translation_tokens=["string", "ring"],
493525
alignment=to_word_alignment_matrix("0-0 1-1"),
526+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
527+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
494528
)
495529
},
496530
)
@@ -525,6 +559,8 @@ def test_verses_out_of_order() -> None:
525559
source_tokens=["verse", "1", "paragraph", "2"],
526560
translation_tokens=["new", "verse", "1", "new", "paragraph", "2"],
527561
alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5"),
562+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
563+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
528564
)
529565
},
530566
),
@@ -536,6 +572,8 @@ def test_verses_out_of_order() -> None:
536572
source_tokens=["verse", "2"],
537573
translation_tokens=["new", "verse", "2"],
538574
alignment=to_word_alignment_matrix("0-1 1-2"),
575+
paragraph_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
576+
style_behavior=UpdateUsfmMarkerBehavior.STRIP,
539577
)
540578
},
541579
),
@@ -562,6 +600,47 @@ def test_verses_out_of_order() -> None:
562600
assess(target, result)
563601

564602

603+
def test_strip_paragraphs_with_header() -> None:
604+
rows = [
605+
UpdateUsfmRow(
606+
scr_ref("MAT 1:1"),
607+
"new verse 1 new paragraph 2",
608+
metadata={
609+
"alignment_info": PlaceMarkersAlignmentInfo(
610+
source_tokens=["verse", "1", "paragraph", "2"],
611+
translation_tokens=["new", "verse", "1", "new", "paragraph", "2"],
612+
alignment=to_word_alignment_matrix("0-1 1-2 2-4 3-5"),
613+
paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP,
614+
style_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
615+
)
616+
},
617+
)
618+
]
619+
usfm = r"""\id MAT
620+
\c 1
621+
\v 1 verse 1
622+
\s header
623+
\p paragraph 2
624+
\v 2 verse 2
625+
"""
626+
627+
target = update_usfm(
628+
rows,
629+
usfm,
630+
paragraph_behavior=UpdateUsfmMarkerBehavior.STRIP,
631+
style_behavior=UpdateUsfmMarkerBehavior.PRESERVE,
632+
update_block_handlers=[PlaceMarkersUsfmUpdateBlockHandler()],
633+
)
634+
result = r"""\id MAT
635+
\c 1
636+
\v 1 new verse 1 new paragraph 2
637+
\s header
638+
\p
639+
\v 2 verse 2
640+
"""
641+
assess(target, result)
642+
643+
565644
def scr_ref(*refs: str) -> List[ScriptureRef]:
566645
return [ScriptureRef.parse(ref) for ref in refs]
567646

0 commit comments

Comments
 (0)