|
| 1 | +from typing import List, Set, Tuple |
| 2 | + |
| 3 | +import regex |
| 4 | + |
| 5 | + |
# This class is used by SegmentBoundaryAdjuster when it is dealing with tokenized text.
class TokenRejoiner:
    """Incrementally rebuild running text from a sequence of tokens.

    Tokens are separated by a single space, except that no space is inserted
    before a token listed in ``_NO_LEADING_SPACE_CHARACTERS`` or after text
    ending in a character listed in ``_NO_TRAILING_SPACE_CHARACTERS``.
    """

    # Characters after which no separating space is inserted.
    _NO_TRAILING_SPACE_CHARACTERS: Set[str] = {"(", "[", "{", "«", "‹", "“", "‘"}
    # Tokens before which no separating space is inserted.
    _NO_LEADING_SPACE_CHARACTERS: Set[str] = {",", ";", ":", ".", "!", "?", ")", "]", "}", "”", "’", "»", "›"}

    def __init__(self) -> None:
        # Text accumulated so far and the number of tokens that have been added.
        self._joined_text = ""
        self._num_tokens = 0

    @classmethod
    def join_tokens(cls, tokens: List[str]) -> str:
        """Join ``tokens`` into a single string.

        A trailing space is appended unless the result is empty or ends in one
        of ``_NO_TRAILING_SPACE_CHARACTERS``.

        Args:
            tokens: The tokens to join, in order.

        Returns:
            The joined text.
        """
        rejoiner = cls()
        for token in tokens:
            rejoiner.add_token_to_joined_text(token)

        joined_text = rejoiner._joined_text
        if joined_text and joined_text[-1] not in cls._NO_TRAILING_SPACE_CHARACTERS:
            joined_text += " "
        return joined_text

    def add_token_to_joined_text(self, token: str) -> str:
        """Append ``token`` to the accumulated text and return the new text.

        An empty token is counted but contributes no text and no separator.

        Args:
            token: The next token.

        Returns:
            The full text accumulated so far, including ``token``.
        """
        self._num_tokens += 1
        if not token:
            return self._joined_text

        # Decide on the separator from the text accumulated so far rather than
        # from the token count: the text can still be empty after empty tokens,
        # and indexing its last character would then raise IndexError.
        needs_separator = (
            bool(self._joined_text)
            and token not in self._NO_LEADING_SPACE_CHARACTERS
            and self._joined_text[-1] not in self._NO_TRAILING_SPACE_CHARACTERS
        )
        if needs_separator:
            self._joined_text += " "
        self._joined_text += token
        return self._joined_text
| 35 | + |
| 36 | + |
class SegmentBoundaryAdjuster:
    """Shift the boundary between adjacent text segments to a more natural position.

    Characters and short word groups are moved across a boundary so that a
    segment does not start with closing punctuation, does not end with opening
    punctuation, and does not split off the first or last few words of a
    sentence.
    """

    # Characters that are moved to the end of the previous segment when they
    # appear at the start of a segment.
    # NOTE(review): the closing guillemets "»" and "›" are not listed here even
    # though their opening counterparts are prohibited as verse endings below
    # -- confirm whether that is intentional.
    _PROHIBITED_VERSE_STARTING_CHARACTERS: Set[str] = {
        " ",
        ",",
        ";",
        ":",
        ".",
        "!",
        "?",
        ")",
        "]",
        "}",
        "”",
        "’",
    }
    # Characters that are moved to the start of the next segment when they
    # appear at the end of a segment.
    _PROHIBITED_VERSE_ENDING_CHARACTERS: Set[str] = {"(", "[", "{", "«", "‹", "“", "‘"}
    # Matches text ending in: a non-word, non-space character (group 1, with any
    # following whitespace), then an uppercase-initial word and up to two more
    # words (group 2, with any trailing whitespace).
    _PUNCTUATION_AND_SENTENCE_STARTING_PATTERN = regex.compile(r".*([^\w\s]\s*)(\p{Lu}\w+(\s+\w+)?(\s+\w+)?\s*)$")
    # Matches text starting with: a lowercase-initial word and up to two more
    # words (group 1), immediately followed by closing punctuation, optional
    # closing quotes and whitespace (group 4).
    _WORDS_AND_SENTENCE_ENDING_PATTERN = regex.compile(r"^(\p{Ll}\w+(\s+\w+)?(\s+\w+)?)([\.,;:!\?\)\]”’]\s*[”’]*\s*)")

    def adjust_segment_boundaries(self, verses: List[str]) -> List[str]:
        """Adjust the boundary between every pair of consecutive verses.

        Args:
            verses: The segment texts, in order. The list is updated in place.

        Returns:
            The same list object, with adjusted texts.
        """
        for i in range(len(verses) - 1):
            verses[i], verses[i + 1] = self.adjust_segment_pair_boundary(verses[i], verses[i + 1])
        return verses

    def adjust_segment_pair_boundary(self, segment: str, next_segment: str) -> Tuple[str, str]:
        """Adjust the boundary between two adjacent segments.

        Args:
            segment: Text of the first segment.
            next_segment: Text of the segment that follows it.

        Returns:
            The adjusted ``(segment, next_segment)`` pair.
        """
        # Pull characters that may not start a segment back into the previous one.
        while next_segment and next_segment[0] in self._PROHIBITED_VERSE_STARTING_CHARACTERS:
            segment += next_segment[0]
            next_segment = next_segment[1:]

        # Push characters that may not end a segment forward into the next one.
        while segment and segment[-1] in self._PROHIBITED_VERSE_ENDING_CHARACTERS:
            next_segment = segment[-1] + next_segment
            segment = segment[:-1]

        # Both helpers return the pair unchanged when their pattern does not
        # match, so they are called directly instead of evaluating each regular
        # expression once for the check and a second time for the adjustment.
        segment, next_segment = self._adjust_for_missed_sentence_start(segment, next_segment)
        segment, next_segment = self._adjust_for_late_sentence_end(segment, next_segment)
        return segment, next_segment

    def _segment_ends_with_start_of_sentence(self, segment: str) -> bool:
        """Return whether ``segment`` ends with punctuation followed by a capitalized word group."""
        return self._PUNCTUATION_AND_SENTENCE_STARTING_PATTERN.match(segment) is not None

    def _adjust_for_missed_sentence_start(self, segment: str, next_segment: str) -> Tuple[str, str]:
        """Move a capitalized word group at the end of ``segment`` to the start of ``next_segment``.

        The pair is returned unchanged when ``segment`` does not match
        ``_PUNCTUATION_AND_SENTENCE_STARTING_PATTERN``.
        """
        match = self._PUNCTUATION_AND_SENTENCE_STARTING_PATTERN.match(segment)
        if match is None:
            return segment, next_segment

        capitalized_word = match.group(2)
        # Keep everything up to and including the punctuation (and the
        # whitespace after it) in the first segment.
        segment = segment[: match.end(1)]
        # Make sure the moved words stay separated from the text they now precede.
        separator = "" if capitalized_word[-1] == " " else " "
        next_segment = capitalized_word + separator + next_segment
        return segment, next_segment

    def _segment_starts_with_end_of_sentence(self, segment: str) -> bool:
        """Return whether ``segment`` starts with a lowercase word group followed by closing punctuation."""
        return self._WORDS_AND_SENTENCE_ENDING_PATTERN.match(segment) is not None

    def _adjust_for_late_sentence_end(self, segment: str, next_segment: str) -> Tuple[str, str]:
        """Move a sentence ending at the start of ``next_segment`` to the end of ``segment``.

        The pair is returned unchanged when ``next_segment`` does not match
        ``_WORDS_AND_SENTENCE_ENDING_PATTERN``.
        """
        match = self._WORDS_AND_SENTENCE_ENDING_PATTERN.match(next_segment)
        if match is None:
            return segment, next_segment

        words = match.group(1)
        punctuation = match.group(4)
        # NOTE(review): the words are appended with no separator, so this
        # presumably relies on ``segment`` already ending in whitespace --
        # verify against callers that pass untokenized text.
        segment = segment + words + punctuation
        next_segment = next_segment[match.end(0) :]
        return segment, next_segment

    def adjust_tokenized_segment_pair_boundaries(self, segment_boundary: int, tokens: List[str]) -> int:
        """Adjust a segment boundary expressed as an index into ``tokens``.

        Args:
            segment_boundary: Index of the first token of the second segment.
            tokens: All tokens of the two segments, in order.

        Returns:
            The adjusted boundary index.
        """
        segment_text = TokenRejoiner.join_tokens(tokens[:segment_boundary])
        next_segment_text = TokenRejoiner.join_tokens(tokens[segment_boundary:])
        adjusted_segment_text, _ = self.adjust_segment_pair_boundary(segment_text, next_segment_text)

        # Only the length of the adjusted first segment is needed to locate the new boundary.
        return self._find_best_boundary_from_segment_length(tokens, len(adjusted_segment_text.strip()))

    def _find_best_boundary_from_segment_length(self, tokens: List[str], target_segment_length: int) -> int:
        """Return the token index whose rejoined prefix length is closest to ``target_segment_length``.

        Tokens are rejoined one at a time; the search stops at the first token
        that brings the rejoined text up to the target length. ``len(tokens)``
        is returned if the target length is never reached.
        """
        token_rejoiner = TokenRejoiner()

        for index, token in enumerate(tokens):
            accumulated_length = len(token_rejoiner.add_token_to_joined_text(token))
            if accumulated_length < target_segment_length:
                continue

            # In the unlikely case that the adjusted boundary falls in the middle of a token
            # select the token boundary that is closest
            error_with_current_boundary = accumulated_length - target_segment_length
            error_with_previous_boundary = target_segment_length - (accumulated_length - len(token))

            if error_with_current_boundary < error_with_previous_boundary:
                return index + 1
            return index

        return len(tokens)
0 commit comments