
Commit c4f0594

Merge branch 'bookstats_improvements' into develop

2 parents: f1a22db + df9d40b

File tree: 12 files changed (+365, -183 lines)

lute/app_factory.py

Lines changed: 0 additions & 14 deletions

@@ -139,19 +139,13 @@ def index():
     if is_production and have_books and should_run_auto_backup:
         return redirect("/backup/backup", 302)

-    refresh_stats()
     warning_msg = backupservice.backup_warning(bkp_settings)
     backup_show_warning = (
         bkp_settings.backup_warn
         and bkp_settings.backup_enabled
         and warning_msg != ""
     )

-    # Disabling caching on this page so that book stats
-    # are recalculated, even if the user hits the browser
-    # "back" button after updating some terms.
-    # ref https://stackoverflow.com/questions/28627324/
-    # disable-cache-on-a-specific-page-using-flask
     response = make_response(
         render_template(
             "index.html",
@@ -164,26 +158,18 @@ def index():
             language_choices=language_choices,
             current_language_id=current_language_id,
             is_production_data=is_production,
-            # Backup stats
             backup_show_warning=backup_show_warning,
             backup_warning_msg=warning_msg,
         )
     )
-    cc = "no-cache, no-store, must-revalidate, public, max-age=0"
-    response.headers["Cache-Control"] = cc
-    response.headers["Pragma"] = "no-cache"
-    response.headers["Expires"] = "0"
     return response

 @app.route("/refresh_all_stats")
 def refresh_all_stats():
     books_to_update = db.session.query(Book).filter(Book.archived == 0).all()
-
     for book in books_to_update:
         mark_stale(book)
-
     refresh_stats()
-
     return redirect("/", 302)

 @app.route("/wipe_database")

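Note: these deletions pair with the caching added in lute/book/stats.py below. Since stats are now persisted in the bookstats table and fetched on demand (see the new /book/table_stats endpoint in lute/book/routes.py), index() no longer needs to eagerly call refresh_stats() or send no-cache headers to force a recalculation on every page view.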
lute/book/routes.py

Lines changed: 15 additions & 0 deletions

@@ -15,6 +15,7 @@
 from lute.book import service
 from lute.book.datatables import get_data_tables_list
 from lute.book.forms import NewBookForm, EditBookForm
+from lute.book.stats import get_stats
 import lute.utils.formutils
 from lute.db import db

@@ -188,3 +189,17 @@ def delete(bookid):
     db.session.delete(b)
     db.session.commit()
     return redirect("/", 302)
+
+
+@bp.route("/table_stats/<int:bookid>", methods=["GET"])
+def table_stats(bookid):
+    "Get the stats, return ajax."
+    b = DBBook.find(bookid)
+    stats = get_stats(b)
+    ret = {
+        "distinctterms": stats.distinctterms,
+        "distinctunknowns": stats.distinctunknowns,
+        "unknownpercent": stats.unknownpercent,
+        "status_distribution": stats.status_distribution,
+    }
+    return jsonify(ret)

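The endpoint returns plain JSON, so the book table can fetch stats per row instead of computing them during page render. A minimal client sketch (hypothetical URL and book id, assuming the blueprint is mounted at /book; Lute's own UI calls this via ajax, and status_distribution comes back as the JSON string stored in the bookstats table):

    import requests  # illustrative standalone client, not part of this commit

    resp = requests.get("http://localhost:5000/book/table_stats/42")
    print(resp.json())
    # Expected shape, with made-up numbers:
    # {"distinctterms": 120, "distinctunknowns": 30, "unknownpercent": 25,
    #  "status_distribution": "{\"0\": 30, \"1\": 25, ...}"}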
lute/book/stats.py

Lines changed: 45 additions & 41 deletions

@@ -3,7 +3,7 @@
 """

 import json
-from lute.read.render.service import get_paragraphs
+from lute.read.render.service import get_multiword_indexer, get_textitems
 from lute.db import db
 from lute.models.book import Book
 from lute.models.setting import UserSetting
@@ -19,51 +19,41 @@ def _last_n_pages(book, txindex, n):
     return texts[-n:]


-def get_status_distribution(book):
+def calc_status_distribution(book):
     """
-    Return statuses and count of unique words per status.
+    Calculate statuses and count of unique words per status.

     Does a full render of a small number of pages
     to calculate the distribution.
     """
-    txindex = 0

-    # dt = DebugTimer("get_status_distribution", display=True)
+    # DebugTimer.clear_total_summary()
+    # dt = DebugTimer("get_status_distribution", display=False)

+    txindex = 0
     if (book.current_tx_id or 0) != 0:
         for t in book.texts:
             if t.id == book.current_tx_id:
                 break
             txindex += 1

-    # Use a sample of pages to speed up stats count.
     sample_size = int(UserSetting.get_value("stats_calc_sample_size") or 5)
     texts = _last_n_pages(book, txindex, sample_size)

     # Getting the individual paragraphs per page, and then combining,
     # is much faster than combining all pages into one giant page.
-    paras = [get_paragraphs(t.text, book.language) for t in texts]
+    mw = get_multiword_indexer(book.language)
+    textitems = []
+    for tx in texts:
+        textitems.extend(get_textitems(tx.text, book.language, mw))
     # # Old slower code:
     # text_sample = "\n".join([t.text for t in texts])
-    # paras = get_paragraphs(text_sample, book.language)
-
+    # paras = get_paragraphs(text_sample, book.language) ... etc.
     # dt.step("get_paragraphs")
-    # DebugTimer.total_summary()
-
-    def flatten_list(nested_list):
-        result = []
-        for item in nested_list:
-            if isinstance(item, list):
-                result.extend(flatten_list(item))
-            else:
-                result.append(item)
-        return result
-
-    text_items = [ti for ti in flatten_list(paras) if ti.is_word]

+    textitems = [ti for ti in textitems if ti.is_word]
     statterms = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 98: [], 99: []}
-
-    for ti in text_items:
+    for ti in textitems:
         statterms[ti.wo_status or 0].append(ti.text_lc)

     stats = {}
@@ -72,6 +62,9 @@ def flatten_list(nested_list):
         statterms[statusval] = uniques
         stats[statusval] = len(uniques)

+    # dt.step("compiled")
+    # DebugTimer.total_summary()
+
     return stats


@@ -83,8 +76,7 @@ class BookStats(db.Model):
     "The stats table."
     __tablename__ = "bookstats"

-    id = db.Column(db.Integer, primary_key=True)
-    BkID = db.Column(db.Integer)
+    BkID = db.Column(db.Integer, primary_key=True)
     distinctterms = db.Column(db.Integer)
     distinctunknowns = db.Column(db.Integer)
     unknownpercent = db.Column(db.Integer)
@@ -100,7 +92,7 @@ def refresh_stats():
     )
     books = [b for b in books_to_update if b.is_supported]
     for book in books:
-        stats = _get_stats(book)
+        stats = _calculate_stats(book)
         _update_stats(book, stats)


@@ -111,31 +103,43 @@ def mark_stale(book):
     db.session.commit()


-def _get_stats(book):
+def get_stats(book):
+    "Gets stats from the cache if available, or calculates."
+    bk_id = book.id
+    stats = db.session.query(BookStats).filter_by(BkID=bk_id).first()
+    if stats is None:
+        newstats = _calculate_stats(book)
+        _update_stats(book, newstats)
+        stats = db.session.query(BookStats).filter_by(BkID=bk_id).first()
+    return stats
+
+
+def _calculate_stats(book):
     "Calc stats for the book using the status distribution."
-    status_distribution = get_status_distribution(book)
+    status_distribution = calc_status_distribution(book)
     unknowns = status_distribution[0]
     allunique = sum(status_distribution.values())

     percent = 0
     if allunique > 0:  # In case not parsed.
         percent = round(100.0 * unknowns / allunique)

-    sd = json.dumps(status_distribution)
-
-    # Any change in the below fields requires a change to
-    # update_stats as well, query insert doesn't check field order.
-    return [allunique, unknowns, percent, sd]
+    return {
+        "allunique": allunique,
+        "unknowns": unknowns,
+        "percent": percent,
+        "distribution": json.dumps(status_distribution),
+    }


 def _update_stats(book, stats):
     "Update BookStats for the given book."
-    new_stats = BookStats(
-        BkID=book.id,
-        distinctterms=stats[0],
-        distinctunknowns=stats[1],
-        unknownpercent=stats[2],
-        status_distribution=stats[3],
-    )
-    db.session.add(new_stats)
+    s = db.session.query(BookStats).filter_by(BkID=book.id).first()
+    if s is None:
+        s = BookStats(BkID=book.id)
+    s.distinctterms = stats["allunique"]
+    s.distinctunknowns = stats["unknowns"]
+    s.unknownpercent = stats["percent"]
+    s.status_distribution = stats["distribution"]
+    db.session.add(s)
     db.session.commit()

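A worked example of the numbers _calculate_stats produces, with made-up counts (unknownpercent is the share of distinct status-0 terms among all distinct terms in the sampled pages):

    status_distribution = {0: 50, 1: 30, 2: 0, 3: 20, 4: 0, 5: 0, 98: 0, 99: 0}
    unknowns = status_distribution[0]              # 50 distinct unknown terms
    allunique = sum(status_distribution.values())  # 100 distinct terms in the sample
    percent = round(100.0 * unknowns / allunique)  # 50

Making BkID the primary key also turns _update_stats into an upsert: each book keeps exactly one bookstats row, updated in place, instead of accumulating a new row per refresh.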
lute/read/render/calculate_textitems.py

Lines changed: 36 additions & 14 deletions

@@ -20,9 +20,12 @@
 """

 import re
+from collections import Counter
 from lute.models.term import Term
 from lute.read.render.text_item import TextItem

+# from lute.utils.debug_helpers import DebugTimer
+
 zws = "\u200B"  # zero-width space


@@ -55,13 +58,14 @@ def get_string_indexes(strings, content):
     return ret


-def _make_textitem(index, text, text_lc, sentence_number, term):
+# pylint: disable=too-many-arguments
+def _make_textitem(index, text, text_lc, count, sentence_number, term):
     "Make a TextItem."
     r = TextItem()
     r.text = text
     r.sentence_number = sentence_number
     r.text_lc = text_lc
-    r.token_count = text.count(zws) + 1
+    r.token_count = count
     r.display_count = r.token_count
     r.index = index
     r.is_word = term is not None
@@ -91,7 +95,7 @@ def _create_missing_status_0_terms(tokens, terms, language):
     return new_terms


-def get_textitems(tokens, terms, language):
+def get_textitems(tokens, terms, language, multiword_term_indexer=None):
     """
     Return TextItems that will **actually be rendered**.

@@ -185,34 +189,48 @@
     """
     # pylint: disable=too-many-locals

+    # dt = DebugTimer("get_textitems", display=False)
+
     new_unknown_terms = _create_missing_status_0_terms(tokens, terms, language)
+    # dt.step("new_unknown_terms")

     all_terms = terms + new_unknown_terms
-
     text_to_term = {dt.text_lc: dt for dt in all_terms}

-    tokens_lc = [language.parser.get_lowercase(t.token) for t in tokens]
+    tokens_orig = [t.token for t in tokens]
+    tokens_lc = [language.parser.get_lowercase(t) for t in tokens_orig]

     textitems = []

-    def _add_textitem(index, text_lc):
+    def _add_textitem(index, text_lc, count):
         "Add a TextItem for position index in tokens."
-        count = text_lc.count(zws) + 1
-        text_orig = zws.join([t.token for t in tokens[index : index + count]])
+        text_orig = tokens_orig[index]
+        if count > 1:
+            text_orig = zws.join(tokens_orig[index : index + count])
         text_lc = zws.join(tokens_lc[index : index + count])
         sentence_number = tokens[index].sentence_number
         term = text_to_term.get(text_lc, None)
-        ti = _make_textitem(index, text_orig, text_lc, sentence_number, term)
+        ti = _make_textitem(index, text_orig, text_lc, count, sentence_number, term)
         textitems.append(ti)

     # Single-word terms.
     for index, _ in enumerate(tokens):
-        _add_textitem(index, tokens_lc[index])
+        _add_textitem(index, tokens_lc[index], 1)
+    # dt.step("single word textitems")

     # Multiword terms.
-    multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1]
-    for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)):
-        _add_textitem(e[1], e[0])
+    if multiword_term_indexer is not None:
+        for r in multiword_term_indexer.search_all(tokens_lc):
+            mwt = text_to_term[r[0]]
+            count = mwt.token_count
+            _add_textitem(r[1], r[0], count)
+        # dt.step(f"get mw textitems w indexer")
+    else:
+        multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1]
+        for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)):
+            count = e[0].count(zws) + 1
+            _add_textitem(e[1], e[0], count)
+        # dt.step("mw textitems without indexer")

     # Sorting by index, then decreasing token count.
     textitems = sorted(textitems, key=lambda x: (x.index, -x.token_count))
@@ -225,8 +243,10 @@ def _add_textitem(index, text_lc):

     # Calc display_counts; e.g. if a textitem's id shows up 3 times
     # in the output_textitem_ids, it should display 3 tokens.
+    id_counts = dict(Counter(output_textitem_ids))
     for ti in textitems:
-        ti.display_count = output_textitem_ids.count(id(ti))
+        ti.display_count = id_counts.get(id(ti), 0)
+    # dt.step("display_count")

     textitems = [ti for ti in textitems if ti.display_count > 0]

@@ -235,5 +255,7 @@ def _add_textitem(index, text_lc):
         ti.paragraph_number = current_paragraph
         if ti.text == "¶":
             current_paragraph += 1
+    # dt.step("paragraphs")
+    # dt.step("done")

     return textitems

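The display_count change is a classic O(n²)-to-O(n) swap: instead of one full list.count() scan per textitem, the ids are tallied once with collections.Counter and then looked up in O(1). A toy sketch of the same idea, with made-up ids:

    from collections import Counter

    ids = [1, 1, 2, 3, 3, 3]

    slow = [ids.count(i) for i in ids]         # one full scan per element: O(n^2)

    id_counts = Counter(ids)                   # tally once: O(n)
    fast = [id_counts.get(i, 0) for i in ids]  # O(1) lookups

    assert slow == fast == [2, 2, 1, 3, 3, 3]

The optional multiword_term_indexer argument does the same for term matching: all multiword terms are found in one pass over the page (see the new module below) instead of one get_string_indexes() scan of the full content per call.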
lute/read/render/multiword_indexer.py

Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+"""
+Find terms in content string using ahocorapy.
+"""
+
+from ahocorapy.keywordtree import KeywordTree
+
+
+class MultiwordTermIndexer:
+    """
+    Find terms in strings using ahocorapy.
+    """
+
+    zws = "\u200B"  # zero-width space
+
+    def __init__(self):
+        self.kwtree = KeywordTree(case_insensitive=True)
+        self.finalized = False
+
+    def add(self, t):
+        "Add zws-enclosed term to tree."
+        add_t = f"{self.zws}{t}{self.zws}"
+        self.kwtree.add(add_t)
+
+    def search_all(self, lc_tokens):
+        "Find all terms and starting token index."
+        if not self.finalized:
+            self.kwtree.finalize()
+            self.finalized = True
+
+        zws = self.zws
+        content = zws + zws.join(lc_tokens) + zws
+        zwsindexes = [i for i, char in enumerate(content) if char == zws]
+        results = self.kwtree.search_all(content)
+
+        for result in results:
+            # print(f"{result}\n", flush=True)
+            t = result[0].strip(zws)
+            charpos = result[1]
+            index = zwsindexes.index(charpos)
+            yield (t, index)

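MultiwordTermIndexer wraps ahocorapy's Aho-Corasick keyword tree, so every multiword term is matched in a single linear scan of the page's tokens. Enclosing each keyword in zero-width spaces ensures matches start and end only on token boundaries, and the zwsindexes lookup converts a hit's character position back into a token index. A small usage sketch with made-up tokens (in Lute, a multiword term's text_lc already has its tokens joined by zero-width spaces):

    zws = "\u200B"

    mw = MultiwordTermIndexer()
    mw.add(f"cat{zws} {zws}sat")   # the term "cat sat", tokens joined by zws

    tokens_lc = ["the", " ", "cat", " ", "sat"]
    for term, index in mw.search_all(tokens_lc):
        print(index)               # 2 -- the match starts at tokens_lc[2], "cat"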