Skip to content

Commit 531934a

Browse files
authored
Pass metadata through update block (#202)
1 parent d8aa497 commit 531934a

9 files changed

+343
-286
lines changed

machine/corpora/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,12 @@
5151
normalize,
5252
unescape_spaces,
5353
)
54-
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
54+
from .update_usfm_parser_handler import (
55+
UpdateUsfmMarkerBehavior,
56+
UpdateUsfmParserHandler,
57+
UpdateUsfmRow,
58+
UpdateUsfmTextBehavior,
59+
)
5560
from .usfm_file_text import UsfmFileText
5661
from .usfm_file_text_corpus import UsfmFileTextCorpus
5762
from .usfm_memory_text import UsfmMemoryText
@@ -135,6 +140,7 @@
135140
"UpdateUsfmMarkerBehavior",
136141
"UpdateUsfmParserHandler",
137142
"UpdateUsfmTextBehavior",
143+
"UpdateUsfmRow",
138144
"UsfmAttribute",
139145
"UsfmElementType",
140146
"UsfmFileText",

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
from abc import ABC, abstractmethod
2-
from typing import BinaryIO, Iterable, Optional, Sequence, Tuple, Union
2+
from typing import BinaryIO, Iterable, Optional, Sequence, Union
33

44
from ..utils.typeshed import StrPath
55
from .paratext_project_settings import ParatextProjectSettings
66
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
7-
from .scripture_ref import ScriptureRef
8-
from .update_usfm_parser_handler import UpdateUsfmMarkerBehavior, UpdateUsfmParserHandler, UpdateUsfmTextBehavior
7+
from .update_usfm_parser_handler import (
8+
UpdateUsfmMarkerBehavior,
9+
UpdateUsfmParserHandler,
10+
UpdateUsfmRow,
11+
UpdateUsfmTextBehavior,
12+
)
913
from .usfm_parser import parse_usfm
1014
from .usfm_update_block_handler import UsfmUpdateBlockHandler
1115

@@ -20,7 +24,7 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
2024
def update_usfm(
2125
self,
2226
book_id: str,
23-
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
27+
rows: Optional[Sequence[UpdateUsfmRow]] = None,
2428
full_name: Optional[str] = None,
2529
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
2630
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,

machine/corpora/place_markers_usfm_update_block_handler.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,36 @@
11
from __future__ import annotations
22

3-
from typing import Iterable, List, TypedDict
3+
from typing import List, TypedDict, cast
44

55
from ..translation.word_alignment_matrix import WordAlignmentMatrix
66
from .usfm_token import UsfmToken, UsfmTokenType
77
from .usfm_update_block import UsfmUpdateBlock
88
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
99
from .usfm_update_block_handler import UsfmUpdateBlockHandler
1010

11+
PLACE_MARKERS_ALIGNMENT_INFO_KEY = "alignment_info"
12+
1113

1214
class PlaceMarkersAlignmentInfo(TypedDict):
    """Alignment data read by PlaceMarkersUsfmUpdateBlockHandler from a block's metadata."""

    # Tokens of the source text.
    source_tokens: List[str]
    # Tokens of the translated text.
    translation_tokens: List[str]
    # Word alignment matrix; process_block skips blocks whose matrix has zero rows or columns.
    alignment: WordAlignmentMatrix
1718

1819

1920
class PlaceMarkersUsfmUpdateBlockHandler(UsfmUpdateBlockHandler):
2021

21-
def __init__(self, align_info: Iterable[PlaceMarkersAlignmentInfo]) -> None:
22-
self._align_info = {info["refs"][0]: info for info in align_info}
23-
2422
def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
25-
ref = str(block.refs[0])
2623
elements = list(block.elements)
2724

2825
# Nothing to do if there are no markers to place or no alignment to use
26+
if PLACE_MARKERS_ALIGNMENT_INFO_KEY not in block.metadata:
27+
return block
28+
29+
alignment_info = cast(PlaceMarkersAlignmentInfo, block.metadata[PLACE_MARKERS_ALIGNMENT_INFO_KEY])
2930
if (
3031
len(elements) == 0
31-
or ref not in self._align_info.keys()
32-
or self._align_info[ref]["alignment"].row_count == 0
33-
or self._align_info[ref]["alignment"].column_count == 0
32+
or alignment_info["alignment"].row_count == 0
33+
or alignment_info["alignment"].column_count == 0
3434
or not any(
3535
(
3636
e.type in [UsfmUpdateBlockElementType.PARAGRAPH, UsfmUpdateBlockElementType.STYLE]
@@ -65,8 +65,8 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
6565
):
6666
eob_empty_paras = False
6767

68-
src_toks = self._align_info[ref]["source_tokens"]
69-
trg_toks = self._align_info[ref]["translation_tokens"]
68+
src_toks: List[str] = alignment_info["source_tokens"]
69+
trg_toks: List[str] = alignment_info["translation_tokens"]
7070
src_tok_idx = 0
7171

7272
src_sent = ""
@@ -112,9 +112,7 @@ def process_block(self, block: UsfmUpdateBlock) -> UsfmUpdateBlock:
112112
# Predict marker placements and get insertion order
113113
to_insert = []
114114
for element, adj_src_tok in zip(to_place, adj_src_toks):
115-
adj_trg_tok = self._predict_marker_location(
116-
self._align_info[ref]["alignment"], adj_src_tok, src_toks, trg_toks
117-
)
115+
adj_trg_tok = self._predict_marker_location(alignment_info["alignment"], adj_src_tok, src_toks, trg_toks)
118116

119117
if (
120118
adj_trg_tok > 0

machine/corpora/update_usfm_parser_handler.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,17 @@ class UpdateUsfmMarkerBehavior(Enum):
2424
STRIP = auto()
2525

2626

27+
class UpdateUsfmRow:
    """One row of replacement text for a USFM update.

    Attributes:
        refs: The scripture references this row applies to.
        text: The replacement text for those references.
        metadata: Optional extra data for the row; ``None`` when not supplied.
    """

    def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[dict[str, object]] = None):
        # All three values are stored exactly as given (no copying or validation).
        self.refs, self.text, self.metadata = refs, text, metadata
32+
33+
2734
class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
2835
def __init__(
2936
self,
30-
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
37+
rows: Optional[Sequence[UpdateUsfmRow]] = None,
3138
id_text: Optional[str] = None,
3239
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
3340
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
@@ -284,12 +291,14 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
284291
tokenizer = UsfmTokenizer(stylesheet)
285292
return tokenizer.detokenize(self._tokens)
286293

287-
def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]:
294+
def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
288295
row_texts: List[str] = []
296+
row_metadata = None
289297
source_index: int = 0
290298
while self._row_index < len(self._rows) and source_index < len(seg_scr_refs):
291299
compare: int = 0
292-
row_scr_refs, text = self._rows[self._row_index]
300+
row = self._rows[self._row_index]
301+
row_scr_refs, text, metadata = row.refs, row.text, row.metadata
293302
for row_scr_ref in row_scr_refs:
294303
while source_index < len(seg_scr_refs):
295304
compare = row_scr_ref.compare_to(seg_scr_refs[source_index], compare_segments=False)
@@ -302,11 +311,12 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> List[str]:
302311
# source and row match
303312
# grab the text - both source and row will be incremented in due time...
304313
row_texts.append(text)
314+
row_metadata = metadata
305315
break
306316
if compare <= 0:
307317
# source is ahead of row, increment row
308318
self._row_index += 1
309-
return row_texts
319+
return row_texts, row_metadata
310320

311321
def _collect_updatable_tokens(self, state: UsfmParserState) -> None:
312322
self._use_updated_text()
@@ -377,8 +387,10 @@ def _has_new_text(self) -> bool:
377387
return any(self._replace_stack) and self._replace_stack[-1]
378388

379389
def _start_update_block(self, scripture_refs: Sequence[ScriptureRef]) -> None:
    """Open a new update block for ``scripture_refs`` and queue the matching row text.

    The rows are advanced first so that the metadata of the matching row can be
    attached to the new block when it is created.
    """
    texts, row_metadata = self._advance_rows(scripture_refs)
    # A row without metadata yields a block with an empty metadata dict.
    block_metadata = {} if row_metadata is None else row_metadata
    self._update_block_stack.append(UsfmUpdateBlock(scripture_refs, metadata=block_metadata))
    # Each row text becomes a TEXT token with a trailing space.
    replacement_tokens = [UsfmToken(UsfmTokenType.TEXT, text=row_text + " ") for row_text in texts]
    self._push_updated_text(replacement_tokens)
383395

384396
def _end_update_block(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None:

machine/corpora/usfm_update_block.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,15 @@
88

99

1010
class UsfmUpdateBlock:
11-
def __init__(self, refs: Iterable[ScriptureRef] = [], elements: Iterable[UsfmUpdateBlockElement] = []) -> None:
11+
def __init__(
12+
self,
13+
refs: Iterable[ScriptureRef] = [],
14+
elements: Iterable[UsfmUpdateBlockElement] = [],
15+
metadata: dict[str, object] = {},
16+
) -> None:
1217
self._refs: list[ScriptureRef] = list(refs)
1318
self._elements: list[UsfmUpdateBlockElement] = list(elements)
19+
self._metadata: dict[str, object] = metadata
1420

1521
@property
1622
def refs(self) -> Sequence[ScriptureRef]:
@@ -20,6 +26,10 @@ def refs(self) -> Sequence[ScriptureRef]:
2026
def elements(self) -> Sequence[UsfmUpdateBlockElement]:
2127
return self._elements
2228

29+
@property
30+
def metadata(self) -> dict[str, object]:
31+
return self._metadata
32+
2333
def add_text(self, tokens: Iterable[UsfmToken]) -> None:
    """Append a TEXT element holding ``tokens`` to the end of this block."""
    # Materialize the iterable so the element owns its own token list.
    text_element = UsfmUpdateBlockElement(UsfmUpdateBlockElementType.TEXT, list(tokens))
    self._elements.append(text_element)
2535

@@ -58,7 +68,7 @@ def get_tokens(self) -> list[UsfmToken]:
5868
return [token for element in self._elements for token in element.get_tokens()]
5969

6070
def __eq__(self, other: UsfmUpdateBlock) -> bool:
61-
return self._refs == other._refs and self._elements == other._elements
71+
return self._refs == other._refs and self._elements == other._elements and self._metadata == other._metadata
6272

6373
def copy(self) -> UsfmUpdateBlock:
64-
return UsfmUpdateBlock(self._refs, self._elements)
74+
return UsfmUpdateBlock(self._refs, self._elements, self._metadata)

0 commit comments

Comments
 (0)