"""
Given text and Terms, determine what to render in the browser.

For example, given the following TextTokens A-I:

    A B C D E F G H I

And the following terms:

    "A" through "I" (single-word terms)
    "B C" (term J)
    "E F G H I" (K)
    "F G" (L)
    "C D E" (M)

The following TextItems would be displayed on the reading screen,
with some of the Terms overlapping:

    [A][B C][-D E][-F G H I]
"""

import re
from lute.models.term import Term
from lute.read.render.text_item import TextItem

zws = "\u200B"  # zero-width space


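# Tokens are joined with zws into one long string so that multiword
# terms can be located with plain regex searches over it (see
# get_string_indexes).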
def get_string_indexes(strings, content):
    """
    Returns a list of tuples: [(string, index), ...]

    e.g., get_string_indexes(["is a", "cat"], "here is a cat")
    returns [("is a", 1), ("cat", 3)].  (Spaces here stand in for
    the zws delimiter; indexes are token positions, not characters.)

    strings and content must be lowercased!
    """
    searchcontent = zws + content + zws
    zwsindexes = [index for index, letter in enumerate(searchcontent) if letter == zws]

    ret = []

    for s in strings:
        # "(?=())" is required because sometimes the search pattern can
        # overlap -- e.g. _b_b_ has _b_ *twice*.
        # https://stackoverflow.com/questions/5616822/
        # how-to-use-regex-to-find-all-overlapping-matches
        pattern = rf"(?=({re.escape(zws + s + zws)}))"
        add_matches = [
            (s, zwsindexes.index(m.start()))
            for m in re.finditer(pattern, searchcontent)
        ]
        ret.extend(add_matches)

    return ret
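
# Illustration (not executed): get_string_indexes reports overlapping
# matches.  With tokens ["b", "b", "b"] joined by zws, the two-token
# string "b<zws>b" occurs at token index 0 *and* token index 1:
#
#   get_string_indexes([zws.join(["b", "b"])], zws.join(["b", "b", "b"]))
#   # => [("b\u200bb", 0), ("b\u200bb", 1)]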


def _make_textitem(index, text, text_lc, sentence_number, term):
    "Make a TextItem."
    r = TextItem()
    r.text = text
    r.sentence_number = sentence_number
    r.text_lc = text_lc
    r.token_count = text.count(zws) + 1
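    # e.g. text "B<zws>C" contains one zws => token_count 2.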
    r.display_count = r.token_count
    r.index = index
    r.is_word = term is not None
    r.term = term
    return r


def _create_missing_status_0_terms(tokens, terms, language):
    "Make new terms as needed for all tokens, using case of last instance."

    original_word_tokens = {t.token for t in tokens if t.is_word}
    parser = language.parser
    lc_word_tokens = {parser.get_lowercase(t): t for t in original_word_tokens}
    term_text_lcs = {t.text_lc for t in terms}

    missing_word_tokens = [
        original for lc, original in lc_word_tokens.items() if lc not in term_text_lcs
    ]

    # Note: create the terms _without parsing_ because some parsers
    # break up characters when the words are given out of context.
    missing_word_tokens = list(set(missing_word_tokens))
    new_terms = [Term.create_term_no_parsing(language, t) for t in missing_word_tokens]
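    # Status 0 marks these as "unknown" terms: placeholders so that
    # every word token on the reading screen has a Term attached.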
    for t in new_terms:
        t.status = 0

    return new_terms


def get_textitems(tokens, terms, language):
    """
    Return TextItems that will **actually be rendered**.

    Method to determine what should be rendered:

    - Create TextItems for all of the tokens, finding their
      starting index in the tokens.

    - "Write" the TextItems to an array in correctly sorted
      order, so that the correct TextItems take precedence
      in the final rendering.

    - Calculate any term overlaps.

    - Return the final list of TextItems that will actually
      be rendered.

    ---

    Applying the above algorithm to the example given in the
    module docstring:

    We have the following TextTokens A-I:

        A B C D E F G H I

    And given the following terms:

        "A" through "I" (single-word terms)
        "B C" (term J)
        "E F G H I" (K)
        "F G" (L)
        "C D E" (M)

    Creating TextItems for all of the terms, finding their starting
    indices in the tokens:

        TextItem     index  length
        -----------  -----  ------
        [A]          0      1
        [B]          1      1
        ...
        [I]          8      1
        [B C]        1      2
        [E F G H I]  4      5
        [F G]        5      2
        [C D E]      2      3

    Sorting by index, then decreasing token count:

        TextItem     index  length  ID (for later reference)
        -----------  -----  ------  ------------------------
        [A]          0      1       t1
        [B C]        1      2       t2
        [B]          1      1       t3
        [C D E]      2      3       t4
        [C]          2      1       t5
        [D]          3      1       t6
        [E F G H I]  4      5       t7
        [E]          4      1       t8
        [F G]        5      2       t9
        [F]          5      1       t10
        [G]          6      1       t11
        [H]          7      1       t12
        [I]          8      1       t13

    Starting at the bottom of the above list and
    working upwards:

    - ID of [I] is written to index 8: [] [] [] [] [] [] [] [] [t13]
    - ID of [H] to index 7: [] [] [] [] [] [] [] [t12] [t13]
    - ...
    - [F G] to indexes 5 *and* 6: [] [] [] [] [] [t9] [t9] [t12] [t13]
    - [E] to index 4: [] [] [] [] [t8] [t9] [t9] [t12] [t13]
    - [E F G H I] to indexes 4-8: [] [] [] [] [t7] [t7] [t7] [t7] [t7]
    - ... etc.

    Using the TextItem IDs, the resulting array would be:

        output array: [t1] [t2] [t2] [t4] [t4] [t7] [t7] [t7] [t7]
                      [A]  [B C]     [-D E]    [-F G H I]

    The only TextItems that will be shown are therefore:
    t1, t2, t4, t7

    To calculate what text is actually displayed, the count
    of each ID is used, e.g.:

    - ID t7 appears 4 times in the output array. The last 4 tokens of
      [E F G H I] are [F G H I], which will be used as t7's display text.
    - ID t2 appears 2 times. The last 2 tokens of [B C] are [B C],
      so that will be the display text. Etc.
    """
    # pylint: disable=too-many-locals

    new_unknown_terms = _create_missing_status_0_terms(tokens, terms, language)

    all_terms = terms + new_unknown_terms

    text_to_term = {dt.text_lc: dt for dt in all_terms}

    tokens_lc = [language.parser.get_lowercase(t.token) for t in tokens]

    textitems = []

    def _add_textitem(index, text_lc):
        "Add a TextItem for position index in tokens."
        count = text_lc.count(zws) + 1
        text_orig = zws.join([t.token for t in tokens[index : index + count]])
        text_lc = zws.join(tokens_lc[index : index + count])
        sentence_number = tokens[index].sentence_number
        term = text_to_term.get(text_lc, None)
        ti = _make_textitem(index, text_orig, text_lc, sentence_number, term)
        textitems.append(ti)

    # Single-word terms.
    for index, _ in enumerate(tokens):
        _add_textitem(index, tokens_lc[index])

    # Multiword terms.
    multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1]
    for text_lc, index in get_string_indexes(multiword_terms, zws.join(tokens_lc)):
        _add_textitem(index, text_lc)

    # Sorting by index, then decreasing token count.
    textitems = sorted(textitems, key=lambda x: (x.index, -x.token_count))

    # "Write out" TextItems to the output array.
    output_textitem_ids = [None] * len(tokens)
    for ti in reversed(textitems):
        for c in range(ti.index, ti.index + ti.token_count):
            output_textitem_ids[c] = id(ti)

    # Calc display_counts; e.g. if a textitem's id shows up 3 times
    # in the output_textitem_ids, it should display 3 tokens.
    for ti in textitems:
        ti.display_count = output_textitem_ids.count(id(ti))

    textitems = [ti for ti in textitems if ti.display_count > 0]

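    # "¶" tokens mark paragraph breaks, so each TextItem after one
    # gets the next paragraph number.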
    current_paragraph = 0
    for ti in textitems:
        ti.paragraph_number = current_paragraph
        if ti.text == "¶":
            current_paragraph += 1

    return textitems
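
# Typical call site (a sketch; the parser call is assumed from the
# surrounding codebase, and render() is purely illustrative):
#
#   tokens = language.parser.get_parsed_tokens(text, language)
#   textitems = get_textitems(tokens, terms, language)
#   for ti in textitems:
#       render(ti.text, ti.term)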