Merge branch 'bookstats_improvements' into develop
jzohrab committed Oct 5, 2024
2 parents f1a22db + df9d40b commit c4f0594
Showing 12 changed files with 365 additions and 183 deletions.
14 changes: 0 additions & 14 deletions lute/app_factory.py
@@ -139,19 +139,13 @@ def index():
if is_production and have_books and should_run_auto_backup:
return redirect("/backup/backup", 302)

refresh_stats()
warning_msg = backupservice.backup_warning(bkp_settings)
backup_show_warning = (
bkp_settings.backup_warn
and bkp_settings.backup_enabled
and warning_msg != ""
)

# Disabling caching on this page so that book stats
# are recalculated, even if the user hits the browser
# "back" button after updating some terms.
# ref https://stackoverflow.com/questions/28627324/
# disable-cache-on-a-specific-page-using-flask
response = make_response(
render_template(
"index.html",
@@ -164,26 +158,18 @@ def index():
language_choices=language_choices,
current_language_id=current_language_id,
is_production_data=is_production,
# Backup stats
backup_show_warning=backup_show_warning,
backup_warning_msg=warning_msg,
)
)
cc = "no-cache, no-store, must-revalidate, public, max-age=0"
response.headers["Cache-Control"] = cc
response.headers["Pragma"] = "no-cache"
response.headers["Expires"] = "0"
return response

@app.route("/refresh_all_stats")
def refresh_all_stats():
books_to_update = db.session.query(Book).filter(Book.archived == 0).all()

for book in books_to_update:
mark_stale(book)

refresh_stats()

return redirect("/", 302)

@app.route("/wipe_database")
15 changes: 15 additions & 0 deletions lute/book/routes.py
@@ -15,6 +15,7 @@
from lute.book import service
from lute.book.datatables import get_data_tables_list
from lute.book.forms import NewBookForm, EditBookForm
from lute.book.stats import get_stats
import lute.utils.formutils
from lute.db import db

@@ -188,3 +189,17 @@ def delete(bookid):
db.session.delete(b)
db.session.commit()
return redirect("/", 302)


@bp.route("/table_stats/<int:bookid>", methods=["GET"])
def table_stats(bookid):
"Get the stats, return ajax."
b = DBBook.find(bookid)
stats = get_stats(b)
ret = {
"distinctterms": stats.distinctterms,
"distinctunknowns": stats.distinctunknowns,
"unknownpercent": stats.unknownpercent,
"status_distribution": stats.status_distribution,
}
return jsonify(ret)
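
For reference, a minimal sketch of consuming the new endpoint from Python. The host, port, URL prefix, and book id here are assumptions for illustration, not part of this commit:

    import requests

    # Hypothetical local instance and book id -- adjust to taste.
    resp = requests.get("http://localhost:5001/book/table_stats/42")
    resp.raise_for_status()
    stats = resp.json()
    print(stats["distinctterms"])        # distinct terms in the sampled pages
    print(stats["distinctunknowns"])     # distinct unknown (status 0) terms
    print(stats["unknownpercent"])       # integer percent, e.g. 24
    print(stats["status_distribution"])  # JSON-encoded {status: unique count}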
86 changes: 45 additions & 41 deletions lute/book/stats.py
@@ -3,7 +3,7 @@
"""

import json
from lute.read.render.service import get_paragraphs
from lute.read.render.service import get_multiword_indexer, get_textitems
from lute.db import db
from lute.models.book import Book
from lute.models.setting import UserSetting
@@ -19,51 +19,41 @@ def _last_n_pages(book, txindex, n):
return texts[-n:]


def get_status_distribution(book):
def calc_status_distribution(book):
"""
Return statuses and count of unique words per status.
Calculate statuses and count of unique words per status.
Does a full render of a small number of pages
to calculate the distribution.
"""
txindex = 0

# dt = DebugTimer("get_status_distribution", display=True)
# DebugTimer.clear_total_summary()
# dt = DebugTimer("get_status_distribution", display=False)

txindex = 0
if (book.current_tx_id or 0) != 0:
for t in book.texts:
if t.id == book.current_tx_id:
break
txindex += 1

# Use a sample of pages to speed up stats count.
sample_size = int(UserSetting.get_value("stats_calc_sample_size") or 5)
texts = _last_n_pages(book, txindex, sample_size)

# Getting the individual paragraphs per page, and then combining,
# is much faster than combining all pages into one giant page.
paras = [get_paragraphs(t.text, book.language) for t in texts]
mw = get_multiword_indexer(book.language)
textitems = []
for tx in texts:
textitems.extend(get_textitems(tx.text, book.language, mw))
# # Old slower code:
# text_sample = "\n".join([t.text for t in texts])
# paras = get_paragraphs(text_sample, book.language)

# paras = get_paragraphs(text_sample, book.language) ... etc.
# dt.step("get_paragraphs")
# DebugTimer.total_summary()

def flatten_list(nested_list):
result = []
for item in nested_list:
if isinstance(item, list):
result.extend(flatten_list(item))
else:
result.append(item)
return result

text_items = [ti for ti in flatten_list(paras) if ti.is_word]

textitems = [ti for ti in textitems if ti.is_word]
statterms = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 98: [], 99: []}

for ti in text_items:
for ti in textitems:
statterms[ti.wo_status or 0].append(ti.text_lc)

stats = {}
@@ -72,6 +62,9 @@ def flatten_list(nested_list):
statterms[statusval] = uniques
stats[statusval] = len(uniques)

# dt.step("compiled")
# DebugTimer.total_summary()

return stats


@@ -83,8 +76,7 @@ class BookStats(db.Model):
"The stats table."
__tablename__ = "bookstats"

id = db.Column(db.Integer, primary_key=True)
BkID = db.Column(db.Integer)
BkID = db.Column(db.Integer, primary_key=True)
distinctterms = db.Column(db.Integer)
distinctunknowns = db.Column(db.Integer)
unknownpercent = db.Column(db.Integer)
@@ -100,7 +92,7 @@ def refresh_stats():
)
books = [b for b in books_to_update if b.is_supported]
for book in books:
stats = _get_stats(book)
stats = _calculate_stats(book)
_update_stats(book, stats)


@@ -111,31 +103,43 @@ def mark_stale(book):
db.session.commit()


def _get_stats(book):
def get_stats(book):
"Gets stats from the cache if available, or calculates."
bk_id = book.id
stats = db.session.query(BookStats).filter_by(BkID=bk_id).first()
if stats is None:
newstats = _calculate_stats(book)
_update_stats(book, newstats)
stats = db.session.query(BookStats).filter_by(BkID=bk_id).first()
return stats


def _calculate_stats(book):
"Calc stats for the book using the status distribution."
status_distribution = get_status_distribution(book)
status_distribution = calc_status_distribution(book)
unknowns = status_distribution[0]
allunique = sum(status_distribution.values())

percent = 0
if allunique > 0: # In case not parsed.
percent = round(100.0 * unknowns / allunique)

sd = json.dumps(status_distribution)

# Any change in the below fields requires a change to
# update_stats as well, query insert doesn't check field order.
return [allunique, unknowns, percent, sd]
return {
"allunique": allunique,
"unknowns": unknowns,
"percent": percent,
"distribution": json.dumps(status_distribution),
}


def _update_stats(book, stats):
"Update BookStats for the given book."
new_stats = BookStats(
BkID=book.id,
distinctterms=stats[0],
distinctunknowns=stats[1],
unknownpercent=stats[2],
status_distribution=stats[3],
)
db.session.add(new_stats)
s = db.session.query(BookStats).filter_by(BkID=book.id).first()
if s is None:
s = BookStats(BkID=book.id)
s.distinctterms = stats["allunique"]
s.distinctunknowns = stats["unknowns"]
s.unknownpercent = stats["percent"]
s.status_distribution = stats["distribution"]
db.session.add(s)
db.session.commit()
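
Taken together, the reworked flow is: get_stats() returns the cached BookStats row, calculating and storing it on a miss; mark_stale() evicts a book's row; refresh_stats() recalculates any books lacking a row. A minimal sketch, assuming an app context and a populated database:

    from lute.book.stats import get_stats, mark_stale, refresh_stats
    from lute.models.book import Book
    from lute.db import db

    book = db.session.query(Book).first()

    stats = get_stats(book)  # cache miss: calculates, stores, re-reads
    print(stats.distinctterms, stats.distinctunknowns, stats.unknownpercent)

    mark_stale(book)   # drop the cached row, e.g. after terms change
    refresh_stats()    # recalculate stats for all books missing a row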
50 changes: 36 additions & 14 deletions lute/read/render/calculate_textitems.py
@@ -20,9 +20,12 @@
"""

import re
from collections import Counter
from lute.models.term import Term
from lute.read.render.text_item import TextItem

# from lute.utils.debug_helpers import DebugTimer

zws = "\u200B" # zero-width space


@@ -55,13 +58,14 @@ def get_string_indexes(strings, content):
return ret


def _make_textitem(index, text, text_lc, sentence_number, term):
# pylint: disable=too-many-arguments
def _make_textitem(index, text, text_lc, count, sentence_number, term):
"Make a TextItem."
r = TextItem()
r.text = text
r.sentence_number = sentence_number
r.text_lc = text_lc
r.token_count = text.count(zws) + 1
r.token_count = count
r.display_count = r.token_count
r.index = index
r.is_word = term is not None
@@ -91,7 +95,7 @@ def _create_missing_status_0_terms(tokens, terms, language):
return new_terms


def get_textitems(tokens, terms, language):
def get_textitems(tokens, terms, language, multiword_term_indexer=None):
"""
Return TextItems that will **actually be rendered**.
@@ -185,34 +189,48 @@
"""
# pylint: disable=too-many-locals

# dt = DebugTimer("get_textitems", display=False)

new_unknown_terms = _create_missing_status_0_terms(tokens, terms, language)
# dt.step("new_unknown_terms")

all_terms = terms + new_unknown_terms

text_to_term = {dt.text_lc: dt for dt in all_terms}

tokens_lc = [language.parser.get_lowercase(t.token) for t in tokens]
tokens_orig = [t.token for t in tokens]
tokens_lc = [language.parser.get_lowercase(t) for t in tokens_orig]

textitems = []

def _add_textitem(index, text_lc):
def _add_textitem(index, text_lc, count):
"Add a TextItem for position index in tokens."
count = text_lc.count(zws) + 1
text_orig = zws.join([t.token for t in tokens[index : index + count]])
text_orig = tokens_orig[index]
if count > 1:
text_orig = zws.join(tokens_orig[index : index + count])
text_lc = zws.join(tokens_lc[index : index + count])
sentence_number = tokens[index].sentence_number
term = text_to_term.get(text_lc, None)
ti = _make_textitem(index, text_orig, text_lc, sentence_number, term)
ti = _make_textitem(index, text_orig, text_lc, count, sentence_number, term)
textitems.append(ti)

# Single-word terms.
for index, _ in enumerate(tokens):
_add_textitem(index, tokens_lc[index])
_add_textitem(index, tokens_lc[index], 1)
# dt.step("single word textitems")

# Multiword terms.
multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1]
for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)):
_add_textitem(e[1], e[0])
if multiword_term_indexer is not None:
for r in multiword_term_indexer.search_all(tokens_lc):
mwt = text_to_term[r[0]]
count = mwt.token_count
_add_textitem(r[1], r[0], count)
# dt.step(f"get mw textitems w indexer")
else:
multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1]
for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)):
count = e[0].count(zws) + 1
_add_textitem(e[1], e[0], count)
# dt.step("mw textitems without indexer")

# Sorting by index, then decreasing token count.
textitems = sorted(textitems, key=lambda x: (x.index, -x.token_count))
@@ -225,8 +243,10 @@ def _add_textitem(index, text_lc):

# Calc display_counts; e.g. if a textitem's id shows up 3 times
# in the output_textitem_ids, it should display 3 tokens.
id_counts = dict(Counter(output_textitem_ids))
for ti in textitems:
ti.display_count = output_textitem_ids.count(id(ti))
ti.display_count = id_counts.get(id(ti), 0)
# dt.step("display_count")

textitems = [ti for ti in textitems if ti.display_count > 0]

@@ -235,5 +255,7 @@ def _add_textitem(index, text_lc):
ti.paragraph_number = current_paragraph
if ti.text == "¶":
current_paragraph += 1
# dt.step("paragraphs")
# dt.step("done")

return textitems
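
With the new optional argument, callers rendering many pages can build one indexer and reuse it across calls instead of re-scanning the zws-joined token string per page. A sketch of the call shape; the tokens, terms, and language objects are assumed to come from the usual parse path:

    from lute.read.render.calculate_textitems import get_textitems
    from lute.read.render.multiword_indexer import MultiwordTermIndexer

    # Index every multiword term once, up front.
    mw = MultiwordTermIndexer()
    for term in terms:
        if term.token_count > 1:
            mw.add(term.text_lc)  # text_lc tokens are zws-delimited

    tis = get_textitems(tokens, terms, language, multiword_term_indexer=mw)
    for ti in tis:
        print(ti.index, ti.token_count, ti.display_count, ti.text)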
40 changes: 40 additions & 0 deletions lute/read/render/multiword_indexer.py
@@ -0,0 +1,40 @@
"""
Find terms in content string using ahocorapy.
"""

from ahocorapy.keywordtree import KeywordTree


class MultiwordTermIndexer:
"""
Find terms in strings using ahocorapy.
"""

zws = "\u200B" # zero-width space

def __init__(self):
self.kwtree = KeywordTree(case_insensitive=True)
self.finalized = False

def add(self, t):
"Add zws-enclosed term to tree."
add_t = f"{self.zws}{t}{self.zws}"
self.kwtree.add(add_t)

def search_all(self, lc_tokens):
"Find all terms and starting token index."
if not self.finalized:
self.kwtree.finalize()
self.finalized = True

zws = self.zws
content = zws + zws.join(lc_tokens) + zws
zwsindexes = [i for i, char in enumerate(content) if char == zws]
results = self.kwtree.search_all(content)

for result in results:
# print(f"{result}\n", flush=True)
t = result[0].strip(zws)
charpos = result[1]
index = zwsindexes.index(charpos)
yield (t, index)
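
A self-contained sketch of the indexer with made-up tokens; the printed order assumes ahocorapy reports matches in scan order:

    from lute.read.render.multiword_indexer import MultiwordTermIndexer

    zws = "\u200B"  # zero-width space, the token delimiter

    mw = MultiwordTermIndexer()
    mw.add(f"good{zws}morning")     # two-token term, lowercased
    mw.add(f"how{zws}are{zws}you")  # three-token term

    tokens_lc = ["good", "morning", ",", "how", "are", "you", "?"]
    for text_lc, index in mw.search_all(tokens_lc):
        print(index, text_lc.replace(zws, "+"))
    # 0 good+morning
    # 3 how+are+you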