Merge branch 'bookstats_improvements' into develop
jzohrab committed Oct 5, 2024
2 parents f1a22db + df9d40b commit c4f0594
Showing 12 changed files with 365 additions and 183 deletions.
14 changes: 0 additions & 14 deletions lute/app_factory.py
@@ -139,19 +139,13 @@ def index():
if is_production and have_books and should_run_auto_backup:
return redirect("/backup/backup", 302)

refresh_stats()
warning_msg = backupservice.backup_warning(bkp_settings)
backup_show_warning = (
bkp_settings.backup_warn
and bkp_settings.backup_enabled
and warning_msg != ""
)

# Disabling caching on this page so that book stats
# are recalculated, even if the user hits the browser
# "back" button after updating some terms.
# ref https://stackoverflow.com/questions/28627324/
# disable-cache-on-a-specific-page-using-flask
response = make_response(
render_template(
"index.html",
@@ -164,26 +158,18 @@ def index():
language_choices=language_choices,
current_language_id=current_language_id,
is_production_data=is_production,
# Backup stats
backup_show_warning=backup_show_warning,
backup_warning_msg=warning_msg,
)
)
cc = "no-cache, no-store, must-revalidate, public, max-age=0"
response.headers["Cache-Control"] = cc
response.headers["Pragma"] = "no-cache"
response.headers["Expires"] = "0"
return response

@app.route("/refresh_all_stats")
def refresh_all_stats():
books_to_update = db.session.query(Book).filter(Book.archived == 0).all()

for book in books_to_update:
mark_stale(book)

refresh_stats()

return redirect("/", 302)

@app.route("/wipe_database")
15 changes: 15 additions & 0 deletions lute/book/routes.py
@@ -15,6 +15,7 @@
from lute.book import service
from lute.book.datatables import get_data_tables_list
from lute.book.forms import NewBookForm, EditBookForm
from lute.book.stats import get_stats
import lute.utils.formutils
from lute.db import db

@@ -188,3 +189,17 @@ def delete(bookid):
db.session.delete(b)
db.session.commit()
return redirect("/", 302)


@bp.route("/table_stats/<int:bookid>", methods=["GET"])
def table_stats(bookid):
"Get the stats, return ajax."
b = DBBook.find(bookid)
stats = get_stats(b)
ret = {
"distinctterms": stats.distinctterms,
"distinctunknowns": stats.distinctunknowns,
"unknownpercent": stats.unknownpercent,
"status_distribution": stats.status_distribution,
}
return jsonify(ret)
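
For reference, a minimal sketch of consuming the new endpoint from Python. The host, port, URL prefix, and book id here are assumptions for illustration, not part of this commit:

    import requests

    # Hypothetical local instance and book id -- adjust to taste.
    resp = requests.get("http://localhost:5001/book/table_stats/42")
    resp.raise_for_status()
    stats = resp.json()
    print(stats["distinctterms"])        # distinct terms in the sampled pages
    print(stats["distinctunknowns"])     # distinct unknown (status 0) terms
    print(stats["unknownpercent"])       # integer percent, e.g. 24
    print(stats["status_distribution"])  # JSON-encoded {status: unique count}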
86 changes: 45 additions & 41 deletions lute/book/stats.py
@@ -3,7 +3,7 @@
"""

import json
from lute.read.render.service import get_paragraphs
from lute.read.render.service import get_multiword_indexer, get_textitems
from lute.db import db
from lute.models.book import Book
from lute.models.setting import UserSetting
@@ -19,51 +19,41 @@ def _last_n_pages(book, txindex, n):
return texts[-n:]


def get_status_distribution(book):
def calc_status_distribution(book):
"""
Return statuses and count of unique words per status.
Calculate statuses and count of unique words per status.
Does a full render of a small number of pages
to calculate the distribution.
"""
txindex = 0

# dt = DebugTimer("get_status_distribution", display=True)
# DebugTimer.clear_total_summary()
# dt = DebugTimer("get_status_distribution", display=False)

txindex = 0
if (book.current_tx_id or 0) != 0:
for t in book.texts:
if t.id == book.current_tx_id:
break
txindex += 1

# Use a sample of pages to speed up stats count.
sample_size = int(UserSetting.get_value("stats_calc_sample_size") or 5)
texts = _last_n_pages(book, txindex, sample_size)

# Getting the individual paragraphs per page, and then combining,
# is much faster than combining all pages into one giant page.
paras = [get_paragraphs(t.text, book.language) for t in texts]
mw = get_multiword_indexer(book.language)
textitems = []
for tx in texts:
textitems.extend(get_textitems(tx.text, book.language, mw))
# # Old slower code:
# text_sample = "\n".join([t.text for t in texts])
# paras = get_paragraphs(text_sample, book.language)

# paras = get_paragraphs(text_sample, book.language) ... etc.
# dt.step("get_paragraphs")
# DebugTimer.total_summary()

def flatten_list(nested_list):
result = []
for item in nested_list:
if isinstance(item, list):
result.extend(flatten_list(item))
else:
result.append(item)
return result

text_items = [ti for ti in flatten_list(paras) if ti.is_word]

textitems = [ti for ti in textitems if ti.is_word]
statterms = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 98: [], 99: []}

for ti in text_items:
for ti in textitems:
statterms[ti.wo_status or 0].append(ti.text_lc)

stats = {}
@@ -72,6 +62,9 @@ def flatten_list(nested_list):
statterms[statusval] = uniques
stats[statusval] = len(uniques)

# dt.step("compiled")
# DebugTimer.total_summary()

return stats


@@ -83,8 +76,7 @@ class BookStats(db.Model):
"The stats table."
__tablename__ = "bookstats"

id = db.Column(db.Integer, primary_key=True)
BkID = db.Column(db.Integer)
BkID = db.Column(db.Integer, primary_key=True)
distinctterms = db.Column(db.Integer)
distinctunknowns = db.Column(db.Integer)
unknownpercent = db.Column(db.Integer)
@@ -100,7 +92,7 @@ def refresh_stats():
)
books = [b for b in books_to_update if b.is_supported]
for book in books:
stats = _get_stats(book)
stats = _calculate_stats(book)
_update_stats(book, stats)


@@ -111,31 +103,43 @@ def mark_stale(book):
db.session.commit()


def _get_stats(book):
def get_stats(book):
"Gets stats from the cache if available, or calculates."
bk_id = book.id
stats = db.session.query(BookStats).filter_by(BkID=bk_id).first()
if stats is None:
newstats = _calculate_stats(book)
_update_stats(book, newstats)
stats = db.session.query(BookStats).filter_by(BkID=bk_id).first()
return stats


def _calculate_stats(book):
"Calc stats for the book using the status distribution."
status_distribution = get_status_distribution(book)
status_distribution = calc_status_distribution(book)
unknowns = status_distribution[0]
allunique = sum(status_distribution.values())

percent = 0
if allunique > 0: # In case not parsed.
percent = round(100.0 * unknowns / allunique)

sd = json.dumps(status_distribution)

# Any change in the below fields requires a change to
# update_stats as well, query insert doesn't check field order.
return [allunique, unknowns, percent, sd]
return {
"allunique": allunique,
"unknowns": unknowns,
"percent": percent,
"distribution": json.dumps(status_distribution),
}


def _update_stats(book, stats):
"Update BookStats for the given book."
new_stats = BookStats(
BkID=book.id,
distinctterms=stats[0],
distinctunknowns=stats[1],
unknownpercent=stats[2],
status_distribution=stats[3],
)
db.session.add(new_stats)
s = db.session.query(BookStats).filter_by(BkID=book.id).first()
if s is None:
s = BookStats(BkID=book.id)
s.distinctterms = stats["allunique"]
s.distinctunknowns = stats["unknowns"]
s.unknownpercent = stats["percent"]
s.status_distribution = stats["distribution"]
db.session.add(s)
db.session.commit()
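
Taken together, the reworked flow is: get_stats() returns the cached BookStats row, calculating and storing it on a miss; mark_stale() evicts a book's row; refresh_stats() recalculates any books lacking a row. A minimal sketch, assuming an app context and a populated database:

    from lute.book.stats import get_stats, mark_stale, refresh_stats
    from lute.models.book import Book
    from lute.db import db

    book = db.session.query(Book).first()

    stats = get_stats(book)  # cache miss: calculates, stores, re-reads
    print(stats.distinctterms, stats.distinctunknowns, stats.unknownpercent)

    mark_stale(book)   # drop the cached row, e.g. after terms change
    refresh_stats()    # recalculate stats for all books missing a row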
50 changes: 36 additions & 14 deletions lute/read/render/calculate_textitems.py
@@ -20,9 +20,12 @@
"""

import re
from collections import Counter
from lute.models.term import Term
from lute.read.render.text_item import TextItem

# from lute.utils.debug_helpers import DebugTimer

zws = "\u200B" # zero-width space


@@ -55,13 +58,14 @@ def get_string_indexes(strings, content):
return ret


def _make_textitem(index, text, text_lc, sentence_number, term):
# pylint: disable=too-many-arguments
def _make_textitem(index, text, text_lc, count, sentence_number, term):
"Make a TextItem."
r = TextItem()
r.text = text
r.sentence_number = sentence_number
r.text_lc = text_lc
r.token_count = text.count(zws) + 1
r.token_count = count
r.display_count = r.token_count
r.index = index
r.is_word = term is not None
@@ -91,7 +95,7 @@ def _create_missing_status_0_terms(tokens, terms, language):
return new_terms


def get_textitems(tokens, terms, language):
def get_textitems(tokens, terms, language, multiword_term_indexer=None):
"""
Return TextItems that will **actually be rendered**.
@@ -185,34 +189,48 @@
"""
# pylint: disable=too-many-locals

# dt = DebugTimer("get_textitems", display=False)

new_unknown_terms = _create_missing_status_0_terms(tokens, terms, language)
# dt.step("new_unknown_terms")

all_terms = terms + new_unknown_terms

text_to_term = {dt.text_lc: dt for dt in all_terms}

tokens_lc = [language.parser.get_lowercase(t.token) for t in tokens]
tokens_orig = [t.token for t in tokens]
tokens_lc = [language.parser.get_lowercase(t) for t in tokens_orig]

textitems = []

def _add_textitem(index, text_lc):
def _add_textitem(index, text_lc, count):
"Add a TextItem for position index in tokens."
count = text_lc.count(zws) + 1
text_orig = zws.join([t.token for t in tokens[index : index + count]])
text_orig = tokens_orig[index]
if count > 1:
text_orig = zws.join(tokens_orig[index : index + count])
text_lc = zws.join(tokens_lc[index : index + count])
sentence_number = tokens[index].sentence_number
term = text_to_term.get(text_lc, None)
ti = _make_textitem(index, text_orig, text_lc, sentence_number, term)
ti = _make_textitem(index, text_orig, text_lc, count, sentence_number, term)
textitems.append(ti)

# Single-word terms.
for index, _ in enumerate(tokens):
_add_textitem(index, tokens_lc[index])
_add_textitem(index, tokens_lc[index], 1)
# dt.step("single word textitems")

# Multiword terms.
multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1]
for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)):
_add_textitem(e[1], e[0])
if multiword_term_indexer is not None:
for r in multiword_term_indexer.search_all(tokens_lc):
mwt = text_to_term[r[0]]
count = mwt.token_count
_add_textitem(r[1], r[0], count)
# dt.step(f"get mw textitems w indexer")
else:
multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1]
for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)):
count = e[0].count(zws) + 1
_add_textitem(e[1], e[0], count)
# dt.step("mw textitems without indexer")

# Sorting by index, then decreasing token count.
textitems = sorted(textitems, key=lambda x: (x.index, -x.token_count))
@@ -225,8 +243,10 @@ def _add_textitem(index, text_lc):

# Calc display_counts; e.g. if a textitem's id shows up 3 times
# in the output_textitem_ids, it should display 3 tokens.
id_counts = dict(Counter(output_textitem_ids))
for ti in textitems:
ti.display_count = output_textitem_ids.count(id(ti))
ti.display_count = id_counts.get(id(ti), 0)
# dt.step("display_count")

textitems = [ti for ti in textitems if ti.display_count > 0]

@@ -235,5 +255,7 @@ def _add_textitem(index, text_lc):
ti.paragraph_number = current_paragraph
if ti.text == "¶":
current_paragraph += 1
# dt.step("paragraphs")
# dt.step("done")

return textitems
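
With the new optional argument, callers rendering many pages can build one indexer and reuse it across calls instead of re-scanning the zws-joined token string per page. A sketch of the call shape; the tokens, terms, and language objects are assumed to come from the usual parse path:

    from lute.read.render.calculate_textitems import get_textitems
    from lute.read.render.multiword_indexer import MultiwordTermIndexer

    # Index every multiword term once, up front.
    mw = MultiwordTermIndexer()
    for term in terms:
        if term.token_count > 1:
            mw.add(term.text_lc)  # text_lc tokens are zws-delimited

    tis = get_textitems(tokens, terms, language, multiword_term_indexer=mw)
    for ti in tis:
        print(ti.index, ti.token_count, ti.display_count, ti.text)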
40 changes: 40 additions & 0 deletions lute/read/render/multiword_indexer.py
@@ -0,0 +1,40 @@
"""
Find terms in content string using ahocorapy.
"""

from ahocorapy.keywordtree import KeywordTree


class MultiwordTermIndexer:
"""
Find terms in strings using ahocorapy.
"""

zws = "\u200B" # zero-width space

def __init__(self):
self.kwtree = KeywordTree(case_insensitive=True)
self.finalized = False

def add(self, t):
"Add zws-enclosed term to tree."
add_t = f"{self.zws}{t}{self.zws}"
self.kwtree.add(add_t)

def search_all(self, lc_tokens):
"Find all terms and starting token index."
if not self.finalized:
self.kwtree.finalize()
self.finalized = True

zws = self.zws
content = zws + zws.join(lc_tokens) + zws
zwsindexes = [i for i, char in enumerate(content) if char == zws]
results = self.kwtree.search_all(content)

for result in results:
# print(f"{result}\n", flush=True)
t = result[0].strip(zws)
charpos = result[1]
index = zwsindexes.index(charpos)
yield (t, index)
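
A self-contained sketch of the indexer with made-up tokens; the printed order assumes ahocorapy reports matches in scan order:

    from lute.read.render.multiword_indexer import MultiwordTermIndexer

    zws = "\u200B"  # zero-width space, the token delimiter

    mw = MultiwordTermIndexer()
    mw.add(f"good{zws}morning")     # two-token term, lowercased
    mw.add(f"how{zws}are{zws}you")  # three-token term

    tokens_lc = ["good", "morning", ",", "how", "are", "you", "?"]
    for text_lc, index in mw.search_all(tokens_lc):
        print(index, text_lc.replace(zws, "+"))
    # 0 good+morning
    # 3 how+are+you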