From ed40ff07128caad6a241ebfd9dc6058143803dda Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Fri, 4 Oct 2024 16:00:25 -0700
Subject: [PATCH 1/5] Add indexer using ahocorapy.

---
 lute/read/render/multiword_indexer.py        | 40 +++++++++++++
 requirements.txt                             |  2 +
 .../read/render/test_multiword_indexer.py    | 57 +++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 lute/read/render/multiword_indexer.py
 create mode 100644 tests/unit/read/render/test_multiword_indexer.py

diff --git a/lute/read/render/multiword_indexer.py b/lute/read/render/multiword_indexer.py
new file mode 100644
index 000000000..dbbc80087
--- /dev/null
+++ b/lute/read/render/multiword_indexer.py
@@ -0,0 +1,40 @@
+"""
+Find terms in content string using ahocorapy.
+"""
+
+from ahocorapy.keywordtree import KeywordTree
+
+
+class MultiwordTermIndexer:
+    """
+    Find terms in strings using ahocorapy.
+    """
+
+    zws = "\u200B"  # zero-width space
+
+    def __init__(self):
+        self.kwtree = KeywordTree(case_insensitive=True)
+        self.finalized = False
+
+    def add(self, t):
+        "Add zws-enclosed term to tree."
+        add_t = f"{self.zws}{t}{self.zws}"
+        self.kwtree.add(add_t)
+
+    def search_all(self, lc_tokens):
+        "Find all terms and starting token index."
+        if not self.finalized:
+            self.kwtree.finalize()
+            self.finalized = True
+
+        zws = self.zws
+        content = zws + zws.join(lc_tokens) + zws
+        zwsindexes = [i for i, char in enumerate(content) if char == zws]
+        results = self.kwtree.search_all(content)
+
+        for result in results:
+            # print(f"{result}\n", flush=True)
+            t = result[0].strip(zws)
+            charpos = result[1]
+            index = zwsindexes.index(charpos)
+            yield (t, index)
diff --git a/requirements.txt b/requirements.txt
index 3109b4361..06604d4bf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+ahocorapy==1.6.2
 astroid==2.15.6
 attrs==23.1.0
 beautifulsoup4==4.12.2
@@ -20,6 +21,7 @@ Flask-SQLAlchemy==3.1.1
 Flask-WTF==1.2.1
 flit==3.9.0
 flit_core==3.9.0
+future==1.0.0
 greenlet==3.0.0
 h11==0.14.0
 identify==2.5.31
diff --git a/tests/unit/read/render/test_multiword_indexer.py b/tests/unit/read/render/test_multiword_indexer.py
new file mode 100644
index 000000000..fefd8935c
--- /dev/null
+++ b/tests/unit/read/render/test_multiword_indexer.py
@@ -0,0 +1,57 @@
+"""
+MultiwordTermIndexer tests.
+"""
+
+import pytest
+from lute.read.render.multiword_indexer import MultiwordTermIndexer
+
+zws = "\u200B"  # zero-width space
+
+
+@pytest.mark.parametrize(
+    "name,terms,tokens,expected",
+    [
+        ("empty", [], ["a"], []),
+        ("no terms", [], ["a"], []),
+        ("no tokens", ["a"], [], []),
+        ("no match", ["x"], ["a"], []),
+        ("single match", ["a"], ["a"], [("a", 0)]),
+        ("single match 2", ["a"], ["b", "a"], [("a", 1)]),
+        ("same term twice", ["a"], ["b", "a", "c", "a"], [("a", 1), ("a", 3)]),
+        (
+            "multiple terms",
+            ["a", "b"],
+            ["b", "a", "c", "a"],
+            [("b", 0), ("a", 1), ("a", 3)],
+        ),
+        ("multi-word term", [f"a{zws}b"], ["b", "a", "b", "a"], [(f"a{zws}b", 1)]),
+        (
+            "repeated m-word term",
+            [f"a{zws}a"],
+            ["a", "a", "a", "b"],
+            [(f"a{zws}a", 0), (f"a{zws}a", 1)],
+        ),
+        ("bound check term at end", ["a"], ["b", "c", "a"], [("a", 2)]),
+    ],
+)
+def test_scenario(name, terms, tokens, expected):
+    "Test scenario."
+    mw = MultiwordTermIndexer()
+    for t in terms:
+        mw.add(t)
+    results = list(mw.search_all(tokens))
+    assert len(results) == len(expected), name
+    assert results == expected, name
+
+
+def test_can_search_multiple_times_with_different_tokens():
+    "The same indexer can be reused to search different token lists."
+ mw = MultiwordTermIndexer() + mw.add("a") + results = list(mw.search_all(["a", "b"])) + assert len(results) == 1, "one match" + assert results[0] == ("a", 0) + + results = list(mw.search_all(["b", "a"])) + assert len(results) == 1, "one match" + assert results[0] == ("a", 1) From 049f643d716403871644bc6c40d687478ca47aac Mon Sep 17 00:00:00 2001 From: Jeff Zohrab Date: Fri, 4 Oct 2024 16:06:53 -0700 Subject: [PATCH 2/5] Use new indexer, reorg code. --- lute/book/stats.py | 84 +++++----- lute/read/render/calculate_textitems.py | 50 ++++-- lute/read/render/service.py | 195 +++++++++++++----------- tests/unit/book/test_stats.py | 37 ++++- 4 files changed, 218 insertions(+), 148 deletions(-) diff --git a/lute/book/stats.py b/lute/book/stats.py index c4c62ff50..27b620524 100644 --- a/lute/book/stats.py +++ b/lute/book/stats.py @@ -3,7 +3,7 @@ """ import json -from lute.read.render.service import get_paragraphs +from lute.read.render.service import get_multiword_indexer, get_textitems from lute.db import db from lute.models.book import Book from lute.models.setting import UserSetting @@ -19,16 +19,17 @@ def _last_n_pages(book, txindex, n): return texts[-n:] -def get_status_distribution(book): +def calc_status_distribution(book): """ - Return statuses and count of unique words per status. + Calculate statuses and count of unique words per status. Does a full render of a small number of pages to calculate the distribution. """ txindex = 0 - # dt = DebugTimer("get_status_distribution", display=True) + # DebugTimer.clear_total_summary() + # dt = DebugTimer("get_status_distribution", display=False) if (book.current_tx_id or 0) != 0: for t in book.texts: @@ -36,34 +37,25 @@ def get_status_distribution(book): break txindex += 1 - # Use a sample of pages to speed up stats count. sample_size = int(UserSetting.get_value("stats_calc_sample_size") or 5) texts = _last_n_pages(book, txindex, sample_size) # Getting the individual paragraphs per page, and then combining, # is much faster than combining all pages into one giant page. - paras = [get_paragraphs(t.text, book.language) for t in texts] + lang = book.language + mw = get_multiword_indexer(lang) + textitems = [] + for tx in texts: + add_tis = [ti for ti in get_textitems(tx.text, lang, mw) if ti.is_word] + textitems.extend(add_tis) # # Old slower code: # text_sample = "\n".join([t.text for t in texts]) - # paras = get_paragraphs(text_sample, book.language) - + # paras = get_paragraphs(text_sample, book.language) ... etc. # dt.step("get_paragraphs") - # DebugTimer.total_summary() - - def flatten_list(nested_list): - result = [] - for item in nested_list: - if isinstance(item, list): - result.extend(flatten_list(item)) - else: - result.append(item) - return result - - text_items = [ti for ti in flatten_list(paras) if ti.is_word] statterms = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 98: [], 99: []} - for ti in text_items: + for ti in textitems: statterms[ti.wo_status or 0].append(ti.text_lc) stats = {} @@ -72,6 +64,9 @@ def flatten_list(nested_list): statterms[statusval] = uniques stats[statusval] = len(uniques) + # dt.step("compiled") + # DebugTimer.total_summary() + return stats @@ -83,8 +78,7 @@ class BookStats(db.Model): "The stats table." 
__tablename__ = "bookstats" - id = db.Column(db.Integer, primary_key=True) - BkID = db.Column(db.Integer) + BkID = db.Column(db.Integer, primary_key=True) distinctterms = db.Column(db.Integer) distinctunknowns = db.Column(db.Integer) unknownpercent = db.Column(db.Integer) @@ -100,7 +94,7 @@ def refresh_stats(): ) books = [b for b in books_to_update if b.is_supported] for book in books: - stats = _get_stats(book) + stats = _calculate_stats(book) _update_stats(book, stats) @@ -111,9 +105,20 @@ def mark_stale(book): db.session.commit() -def _get_stats(book): +def get_stats(book): + "Gets stats from the cache if available, or calculates." + bk_id = book.id + stats = db.session.query(BookStats).filter_by(BkID=bk_id).first() + if stats is None: + newstats = _calculate_stats(book) + _update_stats(book, newstats) + stats = db.session.query(BookStats).filter_by(BkID=bk_id).first() + return stats + + +def _calculate_stats(book): "Calc stats for the book using the status distribution." - status_distribution = get_status_distribution(book) + status_distribution = calc_status_distribution(book) unknowns = status_distribution[0] allunique = sum(status_distribution.values()) @@ -121,21 +126,22 @@ def _get_stats(book): if allunique > 0: # In case not parsed. percent = round(100.0 * unknowns / allunique) - sd = json.dumps(status_distribution) - - # Any change in the below fields requires a change to - # update_stats as well, query insert doesn't check field order. - return [allunique, unknowns, percent, sd] + return { + "allunique": allunique, + "unknowns": unknowns, + "percent": percent, + "distribution": json.dumps(status_distribution), + } def _update_stats(book, stats): "Update BookStats for the given book." - new_stats = BookStats( - BkID=book.id, - distinctterms=stats[0], - distinctunknowns=stats[1], - unknownpercent=stats[2], - status_distribution=stats[3], - ) - db.session.add(new_stats) + s = db.session.query(BookStats).filter_by(BkID=book.id).first() + if s is None: + s = BookStats(BkID=book.id) + s.distinctterms = stats["allunique"] + s.distinctunknowns = stats["unknowns"] + s.unknownpercent = stats["percent"] + s.status_distribution = stats["distribution"] + db.session.add(s) db.session.commit() diff --git a/lute/read/render/calculate_textitems.py b/lute/read/render/calculate_textitems.py index a9a7267ea..b902c6ec3 100644 --- a/lute/read/render/calculate_textitems.py +++ b/lute/read/render/calculate_textitems.py @@ -20,9 +20,12 @@ """ import re +from collections import Counter from lute.models.term import Term from lute.read.render.text_item import TextItem +# from lute.utils.debug_helpers import DebugTimer + zws = "\u200B" # zero-width space @@ -55,13 +58,14 @@ def get_string_indexes(strings, content): return ret -def _make_textitem(index, text, text_lc, sentence_number, term): +# pylint: disable=too-many-arguments +def _make_textitem(index, text, text_lc, count, sentence_number, term): "Make a TextItem." r = TextItem() r.text = text r.sentence_number = sentence_number r.text_lc = text_lc - r.token_count = text.count(zws) + 1 + r.token_count = count r.display_count = r.token_count r.index = index r.is_word = term is not None @@ -91,7 +95,7 @@ def _create_missing_status_0_terms(tokens, terms, language): return new_terms -def get_textitems(tokens, terms, language): +def get_textitems(tokens, terms, language, multiword_term_indexer=None): """ Return TextItems that will **actually be rendered**. 
@@ -185,34 +189,48 @@ def get_textitems(tokens, terms, language): """ # pylint: disable=too-many-locals + # dt = DebugTimer("get_textitems", display=False) + new_unknown_terms = _create_missing_status_0_terms(tokens, terms, language) + # dt.step("new_unknown_terms") all_terms = terms + new_unknown_terms - text_to_term = {dt.text_lc: dt for dt in all_terms} - tokens_lc = [language.parser.get_lowercase(t.token) for t in tokens] + tokens_orig = [t.token for t in tokens] + tokens_lc = [language.parser.get_lowercase(t) for t in tokens_orig] textitems = [] - def _add_textitem(index, text_lc): + def _add_textitem(index, text_lc, count): "Add a TextItem for position index in tokens." - count = text_lc.count(zws) + 1 - text_orig = zws.join([t.token for t in tokens[index : index + count]]) + text_orig = tokens_orig[index] + if count > 1: + text_orig = zws.join(tokens_orig[index : index + count]) text_lc = zws.join(tokens_lc[index : index + count]) sentence_number = tokens[index].sentence_number term = text_to_term.get(text_lc, None) - ti = _make_textitem(index, text_orig, text_lc, sentence_number, term) + ti = _make_textitem(index, text_orig, text_lc, count, sentence_number, term) textitems.append(ti) # Single-word terms. for index, _ in enumerate(tokens): - _add_textitem(index, tokens_lc[index]) + _add_textitem(index, tokens_lc[index], 1) + # dt.step("single word textitems") # Multiword terms. - multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1] - for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)): - _add_textitem(e[1], e[0]) + if multiword_term_indexer is not None: + for r in multiword_term_indexer.search_all(tokens_lc): + mwt = text_to_term[r[0]] + count = mwt.token_count + _add_textitem(r[1], r[0], count) + # dt.step(f"get mw textitems w indexer") + else: + multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1] + for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)): + count = e[0].count(zws) + 1 + _add_textitem(e[1], e[0], count) + # dt.step("mw textitems without indexer") # Sorting by index, then decreasing token count. textitems = sorted(textitems, key=lambda x: (x.index, -x.token_count)) @@ -225,8 +243,10 @@ def _add_textitem(index, text_lc): # Calc display_counts; e.g. if a textitem's id shows up 3 times # in the output_textitem_ids, it should display 3 tokens. + id_counts = dict(Counter(output_textitem_ids)) for ti in textitems: - ti.display_count = output_textitem_ids.count(id(ti)) + ti.display_count = id_counts.get(id(ti), 0) + # dt.step("display_count") textitems = [ti for ti in textitems if ti.display_count > 0] @@ -235,5 +255,7 @@ def _add_textitem(index, text_lc): ti.paragraph_number = current_paragraph if ti.text == "¶": current_paragraph += 1 + # dt.step("paragraphs") + # dt.step("done") return textitems diff --git a/lute/read/render/service.py b/lute/read/render/service.py index 849f32476..433a012b0 100644 --- a/lute/read/render/service.py +++ b/lute/read/render/service.py @@ -8,7 +8,8 @@ from lute.models.term import Term from lute.parse.base import ParsedToken -from lute.read.render.calculate_textitems import get_textitems +from lute.read.render.calculate_textitems import get_textitems as calc_get_textitems +from lute.read.render.multiword_indexer import MultiwordTermIndexer from lute.db import db # from lute.utils.debug_helpers import DebugTimer @@ -24,30 +25,13 @@ def find_all_Terms_in_string(s, language): # pylint: disable=too-many-locals This would return the terms "cat" and "a cat". 
""" - cleaned = re.sub(r" +", " ", s) tokens = language.get_parsed_tokens(cleaned) return _find_all_terms_in_tokens(tokens, language) -# TODO cache_multiword_terms. -# -# Caching all multiword terms cuts down stats calculation time. -# e.g. when calculating stats on 100 pages, the time goes from 0.7s to 0.01s. -# -# Have to sort out cache invalidation (esp for unit tests), and -# separate caches for each language. -# _cached_multiword_terms = None - - def _get_multiword_terms(language): "Get all multiword terms." - - # TODO cache_multiword_terms. - # global _cached_multiword_terms - # if _cached_multiword_terms is not None: - # return _cached_multiword_terms - sql = sqltext( """ SELECT WoID, WoTextLC FROM words @@ -55,11 +39,54 @@ def _get_multiword_terms(language): """ ) sql = sql.bindparams(language_id=language.id) - _cached_multiword_terms = db.session.execute(sql).all() - return _cached_multiword_terms + return db.session.execute(sql).all() + +def _find_all_multi_word_term_text_lcs_in_content(text_lcs, language): + "Find multiword terms, return list of text_lcs." -def _find_all_terms_in_tokens(tokens, language): + # There are a few ways of finding multi-word Terms + # (with token_count > 1) in the content: + # + # 1. load each mword term text_lc via sql and check. + # 2. using the model + # 3. SQL with "LIKE" + # + # During reasonable test runs with my data, the times in seconds + # for each are similar (~0.02, ~0.05, ~0.025). This method is + # only used for small amounts of data, and the user experience hit + # is negligible, so I'll use the first method which IMO is the clearest + # code. + + zws = "\u200B" # zero-width space + content = zws + zws.join(text_lcs) + zws + + # Method 1: + reclist = _get_multiword_terms(language) + return [p[1] for p in reclist if f"{zws}{p[1]}{zws}" in content] + + ## # Method 2: use the model. + ## contained_term_qry = db.session.query(Term).filter( + ## Term.language == language, + ## Term.token_count > 1, + ## func.instr(content, Term.text_lc) > 0, + ## ) + ## return [r.text_lc for r in contained_term_qry.all()] + + ## # Method 3: Query with LIKE + ## sql = sqltext( + ## """ + ## SELECT WoTextLC FROM words + ## WHERE WoLgID=:lid and WoTokenCount>1 + ## AND :content LIKE '%' || :zws || WoTextLC || :zws || '%' + ## """ + ## ) + ## sql = sql.bindparams(lid=language.id, content=content, zws=zws) + ## recs = db.session.execute(sql).all() + ## return [r[0] for r in recs] + + +def _find_all_terms_in_tokens(tokens, language, kwtree=None): """ Find all terms contained in the (ordered) parsed tokens tokens. @@ -73,84 +100,49 @@ def _find_all_terms_in_tokens(tokens, language): - build list of lowercase text in the tokens - append all multword term strings that exist in the content - query for Terms that exist in the list - """ - # Performance breakdown: - # - # About half of the time is spent in "performance hit 1", - # filtering the multiword terms to find those contained in the - # text. A bit less than half is spent is "performance hit 2", the - # actual query. - # - # Future performance improvement considerations: - # - # 1. I considered keeping a cache of multiword terms strings and - # IDs, but IMO the payoff isn't worth the extra complexity at this - # time. - # - # 2. Maybe a different search method like Aho-Corasick (ref - # https://github.com/abusix/ahocorapy) would be useful ... again - # it would imply that all keywords (existing Terms) are loaded - # into the Aho-Corasick automaton. 
This could be cached, but would - # again need methods for cache invalidation and reload etc. + Note: this method only uses indexes for multiword terms, as any + content analyzed is first parsed into tokens before being passed + to this routine. There's no need to search for single-word Terms + in the tokenized strings, they can be found by a simple query. + """ - # dt = DebugTimer("_find_all_terms_in_tokens", display=False) + # Performance: About half of the time in this routine is spent in + # Step 1 (finding multiword terms), the rest in step 2 (the actual + # query). + # dt = DebugTimer("_find_all_terms_in_tokens", display=True) parser = language.parser - - # Each token can map to a single-word Term. text_lcs = [parser.get_lowercase(t.token) for t in tokens] - # Multiword terms - # - # Multiword terms are harder to find as we have to do a full text - # match. - # - # The "obvious" method of using the model is quite slow: - # - # contained_term_qry = db.session.query(Term).filter( - # Term.language == language, - # Term.token_count > 1, - # func.instr(content, Term.text_lc) > 0, - # ) - # contained_terms = contained_term_qry.all() + # Step 1: get the multiwords in the content. + if kwtree is None: + mword_terms = _find_all_multi_word_term_text_lcs_in_content(text_lcs, language) + else: + results = kwtree.search_all(text_lcs) + mword_terms = [r[0] for r in results] + # dt.step("filtered mword terms") + + # Step 2: load the Term objects. # - # Note that querying using 'LIKE' is also slow, i.e: - # sql = sqltext( - # """ - # SELECT WoID FROM words - # WHERE WoLgID=:lid and WoTokenCount>1 - # AND :content LIKE '%' || :zws || WoTextLC || :zws || '%' - # """ - # ) - # sql = sql.bindparams(lid=language.id, content=content, zws=zws) + # The Term fetch is actually performant -- there is no + # real difference between loading the Term objects versus + # loading raw data with SQL and getting dicts. # - # It is actually faster to load all Term text_lc and use python to - # check if the strings are in the content string, and only then - # load the terms. - - # Multiword terms have zws between all tokens. - reclist = _get_multiword_terms(language) - # dt.step(f"mwords, loaded {len(reclist)} records") - - # Performance hit 1 - zws = "\u200B" # zero-width space - content = zws + zws.join(text_lcs) + zws - mword_terms = [p[1] for p in reclist if f"{zws}{p[1]}{zws}" in content] - # dt.step("mword terms") + # Code for getting raw data: + # param_keys = [f"w{i}" for i, _ in enumerate(text_lcs)] + # keys_placeholders = ','.join([f":{k}" for k in param_keys]) + # param_dict = dict(zip(param_keys, text_lcs)) + # param_dict["langid"] = language.id + # sql = sqltext(f"""SELECT WoID, WoTextLC FROM words + # WHERE WoLgID=:langid and WoTextLC in ({keys_placeholders})""") + # sql = sql.bindparams(language.id, *text_lcs) + # results = db.session.execute(sql, param_dict).fetchall() text_lcs.extend(mword_terms) - - # Some term entity relationship objects (tags, parents) could be - # eagerly loaded using ".options(joinedload(Term.term_tags), - # joinedload(Term.parents))", but any gains in subsequent usage - # are offset by the slower query! 
- # Performance hit 2 tok_strings = list(set(text_lcs)) terms_matching_tokens_qry = db.session.query(Term).filter( Term.text_lc.in_(tok_strings), Term.language == language ) - # dt.step("query prep") - all_terms = terms_matching_tokens_qry.all() # dt.step("exec query") @@ -160,23 +152,40 @@ def _find_all_terms_in_tokens(tokens, language): ## Getting paragraphs ############################## -def get_paragraphs(s, language): +def get_textitems(s, language, multiword_term_indexer=None): """ - Get array of arrays of TextItems for the given string s. - """ - # dt = DebugTimer("get_paragraphs", display=False) + Get array of TextItems for the string s. + The multiword_term_indexer is a big performance boost, but takes + time to initialize. + """ # Hacky reset of state of ParsedToken state. # _Shouldn't_ be needed but doesn't hurt, even if it's lame. ParsedToken.reset_counters() cleaned = re.sub(r" +", " ", s) tokens = language.get_parsed_tokens(cleaned) - # dt.step("get_parsed_tokens") + terms = _find_all_terms_in_tokens(tokens, language, multiword_term_indexer) + textitems = calc_get_textitems(tokens, terms, language, multiword_term_indexer) + return textitems + + +def get_multiword_indexer(language): + "Return indexer loaded with all multiword terms." + mw = MultiwordTermIndexer() + for r in _get_multiword_terms(language): + mw.add(r[1]) + return mw - terms = _find_all_terms_in_tokens(tokens, language) - textitems = get_textitems(tokens, terms, language) +def get_paragraphs(s, language): + """ + Get array of arrays of TextItems for the given string s. + + This doesn't use an indexer, as it should only be used + for a single page of text! + """ + textitems = get_textitems(s, language) def _split_textitems_by_paragraph(textitems): "Split by ¶" diff --git a/tests/unit/book/test_stats.py b/tests/unit/book/test_stats.py index dcc118843..92604a7e2 100644 --- a/tests/unit/book/test_stats.py +++ b/tests/unit/book/test_stats.py @@ -6,7 +6,12 @@ from lute.db import db from lute.term.model import Term, Repository -from lute.book.stats import get_status_distribution, refresh_stats, mark_stale +from lute.book.stats import ( + calc_status_distribution, + refresh_stats, + mark_stale, + get_stats, +) from tests.utils import make_text, make_book from tests.dbasserts import assert_record_count_equals, assert_sql_result @@ -35,7 +40,7 @@ def scenario(language, fulltext, terms_and_statuses, expected): for ts in terms_and_statuses: add_term(language, ts[0], ts[1]) - stats = get_status_distribution(b) + stats = calc_status_distribution(b) assert stats == expected @@ -141,6 +146,34 @@ def test_stats_smoke_test(_test_book, spanish): ) +def test_get_stats_calculates_and_caches_stats(_test_book, spanish): + "Calculating stats is expensive, so store them on get." 
+ add_terms(spanish, ["gato", "TENGO"]) + assert_record_count_equals("bookstats", 0, "cache not loaded") + assert_stats([], "No stats cached at start.") + + stats = get_stats(_test_book) + assert stats.BkID == _test_book.id + assert stats.distinctterms == 4 + assert stats.distinctunknowns == 2 + assert stats.unknownpercent == 50 + assert ( + stats.status_distribution + == '{"0": 2, "1": 2, "2": 0, "3": 0, "4": 0, "5": 0, "98": 0, "99": 0}' + ) + + assert_record_count_equals("bookstats", 1, "cache loaded") + assert_stats( + ["4; 2; 50; {'0': 2, '1': 2, '2': 0, '3': 0, '4': 0, '5': 0, '98': 0, '99': 0}"] + ) + stats = get_stats(_test_book) + assert stats.BkID == _test_book.id + assert ( + stats.status_distribution + == '{"0": 2, "1": 2, "2": 0, "3": 0, "4": 0, "5": 0, "98": 0, "99": 0}' + ) + + def test_stats_calculates_rendered_text(_test_book, spanish): "Multiword term counted as one term." add_terms(spanish, ["tengo un"]) From e81b251c6c94094fbb23f5141d7423ae0c82bd54 Mon Sep 17 00:00:00 2001 From: Jeff Zohrab Date: Fri, 4 Oct 2024 16:07:43 -0700 Subject: [PATCH 3/5] Ajax in book stats graphs. --- lute/app_factory.py | 14 --------- lute/book/routes.py | 15 ++++++++++ lute/static/css/styles.css | 5 ++++ lute/templates/book/tablelisting.html | 41 +++++++++++++++++---------- 4 files changed, 46 insertions(+), 29 deletions(-) diff --git a/lute/app_factory.py b/lute/app_factory.py index 3bf1b143d..b6806aaff 100644 --- a/lute/app_factory.py +++ b/lute/app_factory.py @@ -139,7 +139,6 @@ def index(): if is_production and have_books and should_run_auto_backup: return redirect("/backup/backup", 302) - refresh_stats() warning_msg = backupservice.backup_warning(bkp_settings) backup_show_warning = ( bkp_settings.backup_warn @@ -147,11 +146,6 @@ def index(): and warning_msg != "" ) - # Disabling caching on this page so that book stats - # are recalculated, even if the user hits the browser - # "back" button after updating some terms. - # ref https://stackoverflow.com/questions/28627324/ - # disable-cache-on-a-specific-page-using-flask response = make_response( render_template( "index.html", @@ -164,26 +158,18 @@ def index(): language_choices=language_choices, current_language_id=current_language_id, is_production_data=is_production, - # Backup stats backup_show_warning=backup_show_warning, backup_warning_msg=warning_msg, ) ) - cc = "no-cache, no-store, must-revalidate, public, max-age=0" - response.headers["Cache-Control"] = cc - response.headers["Pragma"] = "no-cache" - response.headers["Expires"] = "0" return response @app.route("/refresh_all_stats") def refresh_all_stats(): books_to_update = db.session.query(Book).filter(Book.archived == 0).all() - for book in books_to_update: mark_stale(book) - refresh_stats() - return redirect("/", 302) @app.route("/wipe_database") diff --git a/lute/book/routes.py b/lute/book/routes.py index c1506ca7c..8271bbfe8 100644 --- a/lute/book/routes.py +++ b/lute/book/routes.py @@ -15,6 +15,7 @@ from lute.book import service from lute.book.datatables import get_data_tables_list from lute.book.forms import NewBookForm, EditBookForm +from lute.book.stats import get_stats import lute.utils.formutils from lute.db import db @@ -188,3 +189,17 @@ def delete(bookid): db.session.delete(b) db.session.commit() return redirect("/", 302) + + +@bp.route("/table_stats/", methods=["GET"]) +def table_stats(bookid): + "Get the stats, return ajax." 
+ b = DBBook.find(bookid) + stats = get_stats(b) + ret = { + "distinctterms": stats.distinctterms, + "distinctunknowns": stats.distinctunknowns, + "unknownpercent": stats.unknownpercent, + "status_distribution": stats.status_distribution, + } + return jsonify(ret) diff --git a/lute/static/css/styles.css b/lute/static/css/styles.css index 00faee502..343255f74 100644 --- a/lute/static/css/styles.css +++ b/lute/static/css/styles.css @@ -227,6 +227,7 @@ table#booktable:not(:has(a.completed_book)) a.book-title:before { .refreshed { background-image: url("../icn/waiting2.gif"); + background-repeat: no-repeat; } .book-action-dropdown { @@ -2034,6 +2035,10 @@ input[type="checkbox"][disabled] + label { padding-right: 1.5rem; } +.book-stats-ajax-cell { + font-style: italic; +} + .status-bar-container-empty { border-color: #e6e6e6; background-color: #fff; diff --git a/lute/templates/book/tablelisting.html b/lute/templates/book/tablelisting.html index 3262798fd..71e95d2f1 100644 --- a/lute/templates/book/tablelisting.html +++ b/lute/templates/book/tablelisting.html @@ -93,10 +93,10 @@ { name: "LgName", width: "10%", data: "LgName" }, { name: "TagList", width: "10%", data: "TagList" }, { name: "WordCount", width: "10%", data: "WordCount" }, - { name: "UnknownPercent", render: render_book_stats_graph }, + { name: "UnknownPercent", "searchable": false, render: render_book_stats_graph_placeholder }, { width: "8%", "searchable": false, "orderable": false, render: render_book_actions }, ], - + createdRow: function(row, data, dataIndex) { ajax_in_book_stats(row, data, dataIndex); }, ajax: { url: "/book/datatables/{{ status or 'active' }}", // Additional filters. func calls are required to get the @@ -217,21 +217,32 @@ return `${row['BkTitle']}${pgfraction}`; }; + /* Replaced by the status graph after the ajax call kicked off by createdRow. */ + let render_book_stats_graph_placeholder = function(data, type, row, meta) { + return ``; + }; - let render_book_stats_graph = function(data, type, row, meta) { - const empty_stats = `
 
`; - let statuscounts = row['StatusDistribution']; - if ((statuscounts ?? '')== '') { - return empty_stats; - } - try { - statuscounts = JSON.parse(statuscounts); - } - catch(err) { - console.log(`Invalid json: ${statuscounts}`); - return empty_stats; - } + /* Ajax called from createdRow datatables hook. */ + let ajax_in_book_stats = function(row, data, dataIndex) { + var cell = $(row).find('.book-stats-ajax-cell'); + $.ajax({ + url: '/book/table_stats/' + data['BkID'], + method: 'GET', + success: function(response) { + cell.removeClass("refreshed"); + const result = JSON.parse(response.status_distribution); + const graph = render_stats_graph(result); + cell.html(graph); + }, + error: function() { + cell.text('Error loading data'); + cell.removeClass("refreshed"); + } + }); + }; + /* Generate stats graph
from statuscounts JSON. */ + let render_stats_graph = function(statuscounts) { statuscounts["99"] = statuscounts["98"] + statuscounts["99"]; delete statuscounts['98']; const totalcount = Object.values(statuscounts).reduce((acc, val) => acc + val, 0); From 09b2038224331ebdd06f97d1b4479ffbef0becc3 Mon Sep 17 00:00:00 2001 From: Jeff Zohrab Date: Fri, 4 Oct 2024 16:07:58 -0700 Subject: [PATCH 4/5] Increase number of pages for book stats. --- lute/settings/routes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lute/settings/routes.py b/lute/settings/routes.py index bbf3800d7..4f32d34c9 100644 --- a/lute/settings/routes.py +++ b/lute/settings/routes.py @@ -50,10 +50,8 @@ class UserSettingsForm(FlaskForm): stop_audio_on_term_form_open = BooleanField("Stop audio on term form open") stats_calc_sample_size = IntegerField( "Book stats page sample size", - validators=[InputRequired(), NumberRange(min=1, max=200)], - render_kw={ - "title": "Number of pages to use for book stats calculation. Max 200 for performance." - }, + validators=[InputRequired(), NumberRange(min=1, max=500)], + render_kw={"title": "Number of pages to use for book stats calculation."}, ) mecab_path = StringField("MECAB_PATH environment variable") From df9d40bc8497294133ba6057923563a8a316402e Mon Sep 17 00:00:00 2001 From: Jeff Zohrab Date: Fri, 4 Oct 2024 16:54:32 -0700 Subject: [PATCH 5/5] Fix lint. --- lute/book/stats.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lute/book/stats.py b/lute/book/stats.py index 27b620524..22ea68f7c 100644 --- a/lute/book/stats.py +++ b/lute/book/stats.py @@ -26,11 +26,11 @@ def calc_status_distribution(book): Does a full render of a small number of pages to calculate the distribution. """ - txindex = 0 # DebugTimer.clear_total_summary() # dt = DebugTimer("get_status_distribution", display=False) + txindex = 0 if (book.current_tx_id or 0) != 0: for t in book.texts: if t.id == book.current_tx_id: @@ -42,19 +42,17 @@ def calc_status_distribution(book): # Getting the individual paragraphs per page, and then combining, # is much faster than combining all pages into one giant page. - lang = book.language - mw = get_multiword_indexer(lang) + mw = get_multiword_indexer(book.language) textitems = [] for tx in texts: - add_tis = [ti for ti in get_textitems(tx.text, lang, mw) if ti.is_word] - textitems.extend(add_tis) + textitems.extend(get_textitems(tx.text, book.language, mw)) # # Old slower code: # text_sample = "\n".join([t.text for t in texts]) # paras = get_paragraphs(text_sample, book.language) ... etc. # dt.step("get_paragraphs") + textitems = [ti for ti in textitems if ti.is_word] statterms = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 98: [], 99: []} - for ti in textitems: statterms[ti.wo_status or 0].append(ti.text_lc)
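Note (not part of any patch above): a minimal sketch of how the pieces introduced in patches 1 and 2 are meant to be combined when rendering many pages. It uses only APIs that appear in the diffs (get_multiword_indexer, get_textitems, and the TextItem fields already used in lute/book/stats.py); the count_unknown_terms helper itself is illustrative and is not part of this changeset.

    from lute.read.render.service import get_multiword_indexer, get_textitems

    def count_unknown_terms(book):
        "Illustrative helper: count distinct unknown (status 0) terms across a book's pages."
        lang = book.language
        # Build the Aho-Corasick keyword tree over the language's multiword
        # terms once; this is the expensive step, so the same indexer is
        # reused for every page.
        mw = get_multiword_indexer(lang)
        unknowns = set()
        for tx in book.texts:
            for ti in get_textitems(tx.text, lang, mw):
                if ti.is_word and (ti.wo_status or 0) == 0:
                    unknowns.add(ti.text_lc)
        return len(unknowns)

This mirrors calc_status_distribution in lute/book/stats.py: the indexer is built once per language and then passed to get_textitems for each page, which is what makes sampling more pages for book stats cheap enough (see patch 4, which raises the sample-size limit).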