Skip to content

Commit f1a22db

Browse files
committed
Simplify rendering calculations.
1 parent 145c3ba commit f1a22db

18 files changed

+506
-725
lines changed

lute/book/stats.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,7 @@ def flatten_list(nested_list):
5959
result.append(item)
6060
return result
6161

62-
text_items = []
63-
for s in flatten_list(paras):
64-
text_items.extend(s.textitems)
65-
text_items = [ti for ti in text_items if ti.is_word]
62+
text_items = [ti for ti in flatten_list(paras) if ti.is_word]
6663

6764
statterms = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 98: [], 99: []}
6865

lute/cli/language_term_export.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def _process_book(b, terms):
6363
ti.term
6464
for para in paragraphs
6565
for sentence in para
66-
for ti in sentence.textitems
66+
for ti in sentence
6767
if ti.is_word and ti.term is not None
6868
]
6969
for t in displayed_terms:
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
"""
2+
Given text and Terms, determine what to render in the browser.
3+
4+
For example, given the following TextTokens A-I:
5+
6+
A B C D E F G H I
7+
8+
And the following terms:
9+
10+
"A" through "I" (single-word terms)
11+
"B C" (term J)
12+
"E F G H I" (K)
13+
"F G" (L)
14+
"C D E" (M)
15+
16+
The following TextItems would be displayed on the reading screen,
17+
with some of the Terms overlapping:
18+
19+
[A][B C][-D E][-F G H I]
20+
"""
21+
22+
import re
23+
from lute.models.term import Term
24+
from lute.read.render.text_item import TextItem
25+
26+
zws = "\u200B"  # zero-width space


def get_string_indexes(strings, content):
    """
    Find the starting token index of every occurrence of each string.

    Tokens in content are delimited by zero-width spaces; the returned
    index is the zero-based *token* position, not the character offset.
    Returns a list of (string, index) tuples.

    e.g., get_string_indexes(["is a", "cat"], "here is a cat")
    returns [("is a", 1), ("cat", 3)] -- where the spaces shown are
    actually zero-width spaces.

    strings and content must be lowercased!
    """
    searchcontent = zws + content + zws

    # Map each zws character position to its token index so a regex
    # match start can be converted to a token index in O(1), rather
    # than with a linear list scan per match.
    token_index_at = {
        pos: token_index
        for token_index, pos in enumerate(
            i for i, letter in enumerate(searchcontent) if letter == zws
        )
    }

    ret = []
    for s in strings:
        # "(?=())" is required because sometimes the search pattern can
        # overlap -- e.g. _b_b_ has _b_ *twice*.
        # https://stackoverflow.com/questions/5616822/
        # how-to-use-regex-to-find-all-overlapping-matches
        pattern = rf"(?=({re.escape(zws + s + zws)}))"
        ret.extend(
            (s, token_index_at[m.start()])
            for m in re.finditer(pattern, searchcontent)
        )

    return ret
def _make_textitem(index, text, text_lc, sentence_number, term):
    "Build a TextItem populated from the given token data."
    ti = TextItem()
    ti.text = text
    ti.text_lc = text_lc
    ti.sentence_number = sentence_number
    ti.index = index
    # Token count is the number of zws-delimited tokens in the text.
    ti.token_count = text.count(zws) + 1
    # Until overlaps are calculated, an item displays all of its tokens.
    ti.display_count = ti.token_count
    ti.term = term
    ti.is_word = term is not None
    return ti
72+
def _create_missing_status_0_terms(tokens, terms, language):
    "Make new terms as needed for all tokens, using case of last instance."
    parser = language.parser

    # Map lowercased form -> one original-cased instance of each word token.
    distinct_word_tokens = {t.token for t in tokens if t.is_word}
    originals_by_lc = {parser.get_lowercase(tok): tok for tok in distinct_word_tokens}

    known_lcs = {t.text_lc for t in terms}
    missing = {
        original for lc, original in originals_by_lc.items() if lc not in known_lcs
    }

    # Note: create the terms _without parsing_ because some parsers
    # break up characters when the words are given out of context.
    created = [Term.create_term_no_parsing(language, tok) for tok in missing]
    for term in created:
        term.status = 0

    return created
def get_textitems(tokens, terms, language):
    """
    Return TextItems that will **actually be rendered**.

    Method to determine what should be rendered:

    - Create TextItems for all of the tokens, finding their
      starting index in the tokens.

    - "Write" the TextItems to an array in correctly sorted
      order, so that the correct TextItems take precedence
      in the final rendering.

    - Calculate any term overlaps.

    - Return the final list of TextItems that will actually
      be rendered.

    ---

    Applying the above algorithm to the example given in the module
    header:

    We have the following TextTokens A-I:

      A B C D E F G H I

    And given the following terms:
      "A" through "I" (single-word terms)
      "B C" (term J)
      "E F G H I" (K)
      "F G" (L)
      "C D E" (M)

    Creating TextItems for all of the terms, finding their starting
    indices in the tokens:

      TextToken    index  length
      ----         -----  ------
      [A]          0      1
      [B]          1      1
      ...
      [I]          8      1
      [B C]        1      2
      [E F G H I]  4      5
      [F G]        5      2
      [C D E]      2      3

    Sorting by index, then decreasing token count:

      TextToken    index  length  ID (for later reference)
      ----         -----  ------  ------------------------
      [A]          0      1       t1
      [B C]        1      2       t2
      [B]          1      1       t3
      [C D E]      2      3       t4
      [C]          2      1       t5
      [D]          3      1       t6
      [E F G H I]  4      5       t7
      [E]          4      1       t8
      [F G]        5      2       t9
      [F]          5      1       t10
      [G]          6      1       t11
      [H]          7      1       t12
      [I]          8      1       t13

    Starting at the bottom of the above list and working upwards:

    - ID of [I] is written to index 8: [] [] [] [] [] [] [] [] [t13]
    - ID of [H] to index 7: [] [] [] [] [] [] [] [t12] [t13]
    - ...
    - [F G] to index 5 *and* 6: [] [] [] [] [] [t9] [t9] [t12] [t13]
    - [E] to index 4: [] [] [] [] [t8] [t9] [t9] [t12] [t13]
    - [E F G H I] to indexes 4-8: [] [] [] [] [t7] [t7] [t7] [t7] [t7]
    - ... etc

    Using the TextItem IDs, the resulting array would be:

      output array: [t1] [t2] [t2] [t4] [t4] [t7] [t7] [t7] [t7]
                    [A]  [B C]     [-D E]    [-F G H I]

    The only TextItems that will be shown are therefore:
    t1, t2, t3, t7

    To calculate what text is actually displayed, the count
    of each ID is used. e.g.:
    - ID t7 appears 4 times in the output array.  The last 4 tokens of
      [E F G H I] are [F G H I], which will be used as t7's display text.
    - ID t2 appears 2 times.  The last 2 tokens of [B C] are [B C],
      so that will be the display text. etc.
    """
    # pylint: disable=too-many-locals

    new_unknown_terms = _create_missing_status_0_terms(tokens, terms, language)

    all_terms = terms + new_unknown_terms

    text_to_term = {dt.text_lc: dt for dt in all_terms}

    tokens_lc = [language.parser.get_lowercase(t.token) for t in tokens]

    textitems = []

    def _add_textitem(index, text_lc):
        "Add a TextItem for position index in tokens."
        count = text_lc.count(zws) + 1
        text_orig = zws.join([t.token for t in tokens[index : index + count]])
        text_lc = zws.join(tokens_lc[index : index + count])
        sentence_number = tokens[index].sentence_number
        term = text_to_term.get(text_lc, None)
        ti = _make_textitem(index, text_orig, text_lc, sentence_number, term)
        textitems.append(ti)

    # Single-word terms.
    for index, _ in enumerate(tokens):
        _add_textitem(index, tokens_lc[index])

    # Multiword terms.
    multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1]
    for mwt, start_index in get_string_indexes(multiword_terms, zws.join(tokens_lc)):
        _add_textitem(start_index, mwt)

    # Sorting by index, then decreasing token count.
    textitems = sorted(textitems, key=lambda x: (x.index, -x.token_count))

    # "Write out" TextItems to the output array.  Iterating in reverse
    # means earlier/longer items overwrite later/shorter ones, so they
    # take precedence in the final rendering.
    output_textitem_ids = [None] * len(tokens)
    for ti in reversed(textitems):
        for c in range(ti.index, ti.index + ti.token_count):
            output_textitem_ids[c] = id(ti)

    # Calc display_counts; e.g. if a textitem's id shows up 3 times
    # in the output_textitem_ids, it should display 3 tokens.
    # A single counting pass is O(n), vs. O(n^2) for calling
    # list.count() once per textitem.
    id_counts = {}
    for tid in output_textitem_ids:
        id_counts[tid] = id_counts.get(tid, 0) + 1
    for ti in textitems:
        ti.display_count = id_counts.get(id(ti), 0)

    # Items with a zero display_count are fully hidden by overlaps.
    textitems = [ti for ti in textitems if ti.display_count > 0]

    # Assign paragraph numbers; the paragraph-break marker itself
    # belongs to the paragraph it terminates.
    current_paragraph = 0
    for ti in textitems:
        ti.paragraph_number = current_paragraph
        if ti.text == "¶":
            current_paragraph += 1

    return textitems

0 commit comments

Comments
 (0)