From ed40ff07128caad6a241ebfd9dc6058143803dda Mon Sep 17 00:00:00 2001
From: Jeff Zohrab
Date: Fri, 4 Oct 2024 16:00:25 -0700
Subject: [PATCH 1/5] Add indexer using ahocorapy.

---
 lute/read/render/multiword_indexer.py        | 40 +++++++++++++
 requirements.txt                             |  2 +
 .../read/render/test_multiword_indexer.py    | 57 +++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 lute/read/render/multiword_indexer.py
 create mode 100644 tests/unit/read/render/test_multiword_indexer.py

diff --git a/lute/read/render/multiword_indexer.py b/lute/read/render/multiword_indexer.py
new file mode 100644
index 000000000..dbbc80087
--- /dev/null
+++ b/lute/read/render/multiword_indexer.py
@@ -0,0 +1,40 @@
+"""
+Find terms in content string using ahocorapy.
+"""
+
+from ahocorapy.keywordtree import KeywordTree
+
+
+class MultiwordTermIndexer:
+    """
+    Find terms in strings using ahocorapy.
+    """
+
+    zws = "\u200B"  # zero-width space
+
+    def __init__(self):
+        self.kwtree = KeywordTree(case_insensitive=True)
+        self.finalized = False
+
+    def add(self, t):
+        "Add zws-enclosed term to tree."
+        add_t = f"{self.zws}{t}{self.zws}"
+        self.kwtree.add(add_t)
+
+    def search_all(self, lc_tokens):
+        "Find all terms and starting token index."
+        if not self.finalized:
+            self.kwtree.finalize()
+            self.finalized = True
+
+        zws = self.zws
+        content = zws + zws.join(lc_tokens) + zws
+        zwsindexes = [i for i, char in enumerate(content) if char == zws]
+        results = self.kwtree.search_all(content)
+
+        for result in results:
+            # print(f"{result}\n", flush=True)
+            t = result[0].strip(zws)
+            charpos = result[1]
+            index = zwsindexes.index(charpos)
+            yield (t, index)
diff --git a/requirements.txt b/requirements.txt
index 3109b4361..06604d4bf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+ahocorapy==1.6.2
 astroid==2.15.6
 attrs==23.1.0
 beautifulsoup4==4.12.2
@@ -20,6 +21,7 @@ Flask-SQLAlchemy==3.1.1
 Flask-WTF==1.2.1
 flit==3.9.0
 flit_core==3.9.0
+future==1.0.0
 greenlet==3.0.0
 h11==0.14.0
 identify==2.5.31
diff --git a/tests/unit/read/render/test_multiword_indexer.py b/tests/unit/read/render/test_multiword_indexer.py
new file mode 100644
index 000000000..fefd8935c
--- /dev/null
+++ b/tests/unit/read/render/test_multiword_indexer.py
@@ -0,0 +1,57 @@
+"""
+MultiwordTermIndexer tests.
+"""
+
+import pytest
+from lute.read.render.multiword_indexer import MultiwordTermIndexer
+
+zws = "\u200B"  # zero-width space
+
+
+@pytest.mark.parametrize(
+    "name,terms,tokens,expected",
+    [
+        ("empty", [], ["a"], []),
+        ("no terms", [], ["a"], []),
+        ("no tokens", ["a"], [], []),
+        ("no match", ["x"], ["a"], []),
+        ("single match", ["a"], ["a"], [("a", 0)]),
+        ("single match 2", ["a"], ["b", "a"], [("a", 1)]),
+        ("same term twice", ["a"], ["b", "a", "c", "a"], [("a", 1), ("a", 3)]),
+        (
+            "multiple terms",
+            ["a", "b"],
+            ["b", "a", "c", "a"],
+            [("b", 0), ("a", 1), ("a", 3)],
+        ),
+        ("multi-word term", [f"a{zws}b"], ["b", "a", "b", "a"], [(f"a{zws}b", 1)]),
+        (
+            "repeated m-word term",
+            [f"a{zws}a"],
+            ["a", "a", "a", "b"],
+            [(f"a{zws}a", 0), (f"a{zws}a", 1)],
+        ),
+        ("bound check term at end", ["a"], ["b", "c", "a"], [("a", 2)]),
+    ],
+)
+def test_scenario(name, terms, tokens, expected):
+    "Test scenario."
+    mw = MultiwordTermIndexer()
+    for t in terms:
+        mw.add(t)
+    results = list(mw.search_all(tokens))
+    assert len(results) == len(expected), name
+    assert results == expected, name
+
+
+def test_can_search_multiple_times_with_different_tokens():
+    "The same indexer can be reused to search different token lists."
+ mw = MultiwordTermIndexer() + mw.add("a") + results = list(mw.search_all(["a", "b"])) + assert len(results) == 1, "one match" + assert results[0] == ("a", 0) + + results = list(mw.search_all(["b", "a"])) + assert len(results) == 1, "one match" + assert results[0] == ("a", 1) From 049f643d716403871644bc6c40d687478ca47aac Mon Sep 17 00:00:00 2001 From: Jeff Zohrab Date: Fri, 4 Oct 2024 16:06:53 -0700 Subject: [PATCH 2/5] Use new indexer, reorg code. --- lute/book/stats.py | 84 +++++----- lute/read/render/calculate_textitems.py | 50 ++++-- lute/read/render/service.py | 195 +++++++++++++----------- tests/unit/book/test_stats.py | 37 ++++- 4 files changed, 218 insertions(+), 148 deletions(-) diff --git a/lute/book/stats.py b/lute/book/stats.py index c4c62ff50..27b620524 100644 --- a/lute/book/stats.py +++ b/lute/book/stats.py @@ -3,7 +3,7 @@ """ import json -from lute.read.render.service import get_paragraphs +from lute.read.render.service import get_multiword_indexer, get_textitems from lute.db import db from lute.models.book import Book from lute.models.setting import UserSetting @@ -19,16 +19,17 @@ def _last_n_pages(book, txindex, n): return texts[-n:] -def get_status_distribution(book): +def calc_status_distribution(book): """ - Return statuses and count of unique words per status. + Calculate statuses and count of unique words per status. Does a full render of a small number of pages to calculate the distribution. """ txindex = 0 - # dt = DebugTimer("get_status_distribution", display=True) + # DebugTimer.clear_total_summary() + # dt = DebugTimer("get_status_distribution", display=False) if (book.current_tx_id or 0) != 0: for t in book.texts: @@ -36,34 +37,25 @@ def get_status_distribution(book): break txindex += 1 - # Use a sample of pages to speed up stats count. sample_size = int(UserSetting.get_value("stats_calc_sample_size") or 5) texts = _last_n_pages(book, txindex, sample_size) # Getting the individual paragraphs per page, and then combining, # is much faster than combining all pages into one giant page. - paras = [get_paragraphs(t.text, book.language) for t in texts] + lang = book.language + mw = get_multiword_indexer(lang) + textitems = [] + for tx in texts: + add_tis = [ti for ti in get_textitems(tx.text, lang, mw) if ti.is_word] + textitems.extend(add_tis) # # Old slower code: # text_sample = "\n".join([t.text for t in texts]) - # paras = get_paragraphs(text_sample, book.language) - + # paras = get_paragraphs(text_sample, book.language) ... etc. # dt.step("get_paragraphs") - # DebugTimer.total_summary() - - def flatten_list(nested_list): - result = [] - for item in nested_list: - if isinstance(item, list): - result.extend(flatten_list(item)) - else: - result.append(item) - return result - - text_items = [ti for ti in flatten_list(paras) if ti.is_word] statterms = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 98: [], 99: []} - for ti in text_items: + for ti in textitems: statterms[ti.wo_status or 0].append(ti.text_lc) stats = {} @@ -72,6 +64,9 @@ def flatten_list(nested_list): statterms[statusval] = uniques stats[statusval] = len(uniques) + # dt.step("compiled") + # DebugTimer.total_summary() + return stats @@ -83,8 +78,7 @@ class BookStats(db.Model): "The stats table." 
__tablename__ = "bookstats" - id = db.Column(db.Integer, primary_key=True) - BkID = db.Column(db.Integer) + BkID = db.Column(db.Integer, primary_key=True) distinctterms = db.Column(db.Integer) distinctunknowns = db.Column(db.Integer) unknownpercent = db.Column(db.Integer) @@ -100,7 +94,7 @@ def refresh_stats(): ) books = [b for b in books_to_update if b.is_supported] for book in books: - stats = _get_stats(book) + stats = _calculate_stats(book) _update_stats(book, stats) @@ -111,9 +105,20 @@ def mark_stale(book): db.session.commit() -def _get_stats(book): +def get_stats(book): + "Gets stats from the cache if available, or calculates." + bk_id = book.id + stats = db.session.query(BookStats).filter_by(BkID=bk_id).first() + if stats is None: + newstats = _calculate_stats(book) + _update_stats(book, newstats) + stats = db.session.query(BookStats).filter_by(BkID=bk_id).first() + return stats + + +def _calculate_stats(book): "Calc stats for the book using the status distribution." - status_distribution = get_status_distribution(book) + status_distribution = calc_status_distribution(book) unknowns = status_distribution[0] allunique = sum(status_distribution.values()) @@ -121,21 +126,22 @@ def _get_stats(book): if allunique > 0: # In case not parsed. percent = round(100.0 * unknowns / allunique) - sd = json.dumps(status_distribution) - - # Any change in the below fields requires a change to - # update_stats as well, query insert doesn't check field order. - return [allunique, unknowns, percent, sd] + return { + "allunique": allunique, + "unknowns": unknowns, + "percent": percent, + "distribution": json.dumps(status_distribution), + } def _update_stats(book, stats): "Update BookStats for the given book." - new_stats = BookStats( - BkID=book.id, - distinctterms=stats[0], - distinctunknowns=stats[1], - unknownpercent=stats[2], - status_distribution=stats[3], - ) - db.session.add(new_stats) + s = db.session.query(BookStats).filter_by(BkID=book.id).first() + if s is None: + s = BookStats(BkID=book.id) + s.distinctterms = stats["allunique"] + s.distinctunknowns = stats["unknowns"] + s.unknownpercent = stats["percent"] + s.status_distribution = stats["distribution"] + db.session.add(s) db.session.commit() diff --git a/lute/read/render/calculate_textitems.py b/lute/read/render/calculate_textitems.py index a9a7267ea..b902c6ec3 100644 --- a/lute/read/render/calculate_textitems.py +++ b/lute/read/render/calculate_textitems.py @@ -20,9 +20,12 @@ """ import re +from collections import Counter from lute.models.term import Term from lute.read.render.text_item import TextItem +# from lute.utils.debug_helpers import DebugTimer + zws = "\u200B" # zero-width space @@ -55,13 +58,14 @@ def get_string_indexes(strings, content): return ret -def _make_textitem(index, text, text_lc, sentence_number, term): +# pylint: disable=too-many-arguments +def _make_textitem(index, text, text_lc, count, sentence_number, term): "Make a TextItem." r = TextItem() r.text = text r.sentence_number = sentence_number r.text_lc = text_lc - r.token_count = text.count(zws) + 1 + r.token_count = count r.display_count = r.token_count r.index = index r.is_word = term is not None @@ -91,7 +95,7 @@ def _create_missing_status_0_terms(tokens, terms, language): return new_terms -def get_textitems(tokens, terms, language): +def get_textitems(tokens, terms, language, multiword_term_indexer=None): """ Return TextItems that will **actually be rendered**. 
@@ -185,34 +189,48 @@ def get_textitems(tokens, terms, language): """ # pylint: disable=too-many-locals + # dt = DebugTimer("get_textitems", display=False) + new_unknown_terms = _create_missing_status_0_terms(tokens, terms, language) + # dt.step("new_unknown_terms") all_terms = terms + new_unknown_terms - text_to_term = {dt.text_lc: dt for dt in all_terms} - tokens_lc = [language.parser.get_lowercase(t.token) for t in tokens] + tokens_orig = [t.token for t in tokens] + tokens_lc = [language.parser.get_lowercase(t) for t in tokens_orig] textitems = [] - def _add_textitem(index, text_lc): + def _add_textitem(index, text_lc, count): "Add a TextItem for position index in tokens." - count = text_lc.count(zws) + 1 - text_orig = zws.join([t.token for t in tokens[index : index + count]]) + text_orig = tokens_orig[index] + if count > 1: + text_orig = zws.join(tokens_orig[index : index + count]) text_lc = zws.join(tokens_lc[index : index + count]) sentence_number = tokens[index].sentence_number term = text_to_term.get(text_lc, None) - ti = _make_textitem(index, text_orig, text_lc, sentence_number, term) + ti = _make_textitem(index, text_orig, text_lc, count, sentence_number, term) textitems.append(ti) # Single-word terms. for index, _ in enumerate(tokens): - _add_textitem(index, tokens_lc[index]) + _add_textitem(index, tokens_lc[index], 1) + # dt.step("single word textitems") # Multiword terms. - multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1] - for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)): - _add_textitem(e[1], e[0]) + if multiword_term_indexer is not None: + for r in multiword_term_indexer.search_all(tokens_lc): + mwt = text_to_term[r[0]] + count = mwt.token_count + _add_textitem(r[1], r[0], count) + # dt.step(f"get mw textitems w indexer") + else: + multiword_terms = [t.text_lc for t in all_terms if t.token_count > 1] + for e in get_string_indexes(multiword_terms, zws.join(tokens_lc)): + count = e[0].count(zws) + 1 + _add_textitem(e[1], e[0], count) + # dt.step("mw textitems without indexer") # Sorting by index, then decreasing token count. textitems = sorted(textitems, key=lambda x: (x.index, -x.token_count)) @@ -225,8 +243,10 @@ def _add_textitem(index, text_lc): # Calc display_counts; e.g. if a textitem's id shows up 3 times # in the output_textitem_ids, it should display 3 tokens. + id_counts = dict(Counter(output_textitem_ids)) for ti in textitems: - ti.display_count = output_textitem_ids.count(id(ti)) + ti.display_count = id_counts.get(id(ti), 0) + # dt.step("display_count") textitems = [ti for ti in textitems if ti.display_count > 0] @@ -235,5 +255,7 @@ def _add_textitem(index, text_lc): ti.paragraph_number = current_paragraph if ti.text == "¶": current_paragraph += 1 + # dt.step("paragraphs") + # dt.step("done") return textitems diff --git a/lute/read/render/service.py b/lute/read/render/service.py index 849f32476..433a012b0 100644 --- a/lute/read/render/service.py +++ b/lute/read/render/service.py @@ -8,7 +8,8 @@ from lute.models.term import Term from lute.parse.base import ParsedToken -from lute.read.render.calculate_textitems import get_textitems +from lute.read.render.calculate_textitems import get_textitems as calc_get_textitems +from lute.read.render.multiword_indexer import MultiwordTermIndexer from lute.db import db # from lute.utils.debug_helpers import DebugTimer @@ -24,30 +25,13 @@ def find_all_Terms_in_string(s, language): # pylint: disable=too-many-locals This would return the terms "cat" and "a cat". 
""" - cleaned = re.sub(r" +", " ", s) tokens = language.get_parsed_tokens(cleaned) return _find_all_terms_in_tokens(tokens, language) -# TODO cache_multiword_terms. -# -# Caching all multiword terms cuts down stats calculation time. -# e.g. when calculating stats on 100 pages, the time goes from 0.7s to 0.01s. -# -# Have to sort out cache invalidation (esp for unit tests), and -# separate caches for each language. -# _cached_multiword_terms = None - - def _get_multiword_terms(language): "Get all multiword terms." - - # TODO cache_multiword_terms. - # global _cached_multiword_terms - # if _cached_multiword_terms is not None: - # return _cached_multiword_terms - sql = sqltext( """ SELECT WoID, WoTextLC FROM words @@ -55,11 +39,54 @@ def _get_multiword_terms(language): """ ) sql = sql.bindparams(language_id=language.id) - _cached_multiword_terms = db.session.execute(sql).all() - return _cached_multiword_terms + return db.session.execute(sql).all() + +def _find_all_multi_word_term_text_lcs_in_content(text_lcs, language): + "Find multiword terms, return list of text_lcs." -def _find_all_terms_in_tokens(tokens, language): + # There are a few ways of finding multi-word Terms + # (with token_count > 1) in the content: + # + # 1. load each mword term text_lc via sql and check. + # 2. using the model + # 3. SQL with "LIKE" + # + # During reasonable test runs with my data, the times in seconds + # for each are similar (~0.02, ~0.05, ~0.025). This method is + # only used for small amounts of data, and the user experience hit + # is negligible, so I'll use the first method which IMO is the clearest + # code. + + zws = "\u200B" # zero-width space + content = zws + zws.join(text_lcs) + zws + + # Method 1: + reclist = _get_multiword_terms(language) + return [p[1] for p in reclist if f"{zws}{p[1]}{zws}" in content] + + ## # Method 2: use the model. + ## contained_term_qry = db.session.query(Term).filter( + ## Term.language == language, + ## Term.token_count > 1, + ## func.instr(content, Term.text_lc) > 0, + ## ) + ## return [r.text_lc for r in contained_term_qry.all()] + + ## # Method 3: Query with LIKE + ## sql = sqltext( + ## """ + ## SELECT WoTextLC FROM words + ## WHERE WoLgID=:lid and WoTokenCount>1 + ## AND :content LIKE '%' || :zws || WoTextLC || :zws || '%' + ## """ + ## ) + ## sql = sql.bindparams(lid=language.id, content=content, zws=zws) + ## recs = db.session.execute(sql).all() + ## return [r[0] for r in recs] + + +def _find_all_terms_in_tokens(tokens, language, kwtree=None): """ Find all terms contained in the (ordered) parsed tokens tokens. @@ -73,84 +100,49 @@ def _find_all_terms_in_tokens(tokens, language): - build list of lowercase text in the tokens - append all multword term strings that exist in the content - query for Terms that exist in the list - """ - # Performance breakdown: - # - # About half of the time is spent in "performance hit 1", - # filtering the multiword terms to find those contained in the - # text. A bit less than half is spent is "performance hit 2", the - # actual query. - # - # Future performance improvement considerations: - # - # 1. I considered keeping a cache of multiword terms strings and - # IDs, but IMO the payoff isn't worth the extra complexity at this - # time. - # - # 2. Maybe a different search method like Aho-Corasick (ref - # https://github.com/abusix/ahocorapy) would be useful ... again - # it would imply that all keywords (existing Terms) are loaded - # into the Aho-Corasick automaton. 
This could be cached, but would - # again need methods for cache invalidation and reload etc. + Note: this method only uses indexes for multiword terms, as any + content analyzed is first parsed into tokens before being passed + to this routine. There's no need to search for single-word Terms + in the tokenized strings, they can be found by a simple query. + """ - # dt = DebugTimer("_find_all_terms_in_tokens", display=False) + # Performance: About half of the time in this routine is spent in + # Step 1 (finding multiword terms), the rest in step 2 (the actual + # query). + # dt = DebugTimer("_find_all_terms_in_tokens", display=True) parser = language.parser - - # Each token can map to a single-word Term. text_lcs = [parser.get_lowercase(t.token) for t in tokens] - # Multiword terms - # - # Multiword terms are harder to find as we have to do a full text - # match. - # - # The "obvious" method of using the model is quite slow: - # - # contained_term_qry = db.session.query(Term).filter( - # Term.language == language, - # Term.token_count > 1, - # func.instr(content, Term.text_lc) > 0, - # ) - # contained_terms = contained_term_qry.all() + # Step 1: get the multiwords in the content. + if kwtree is None: + mword_terms = _find_all_multi_word_term_text_lcs_in_content(text_lcs, language) + else: + results = kwtree.search_all(text_lcs) + mword_terms = [r[0] for r in results] + # dt.step("filtered mword terms") + + # Step 2: load the Term objects. # - # Note that querying using 'LIKE' is also slow, i.e: - # sql = sqltext( - # """ - # SELECT WoID FROM words - # WHERE WoLgID=:lid and WoTokenCount>1 - # AND :content LIKE '%' || :zws || WoTextLC || :zws || '%' - # """ - # ) - # sql = sql.bindparams(lid=language.id, content=content, zws=zws) + # The Term fetch is actually performant -- there is no + # real difference between loading the Term objects versus + # loading raw data with SQL and getting dicts. # - # It is actually faster to load all Term text_lc and use python to - # check if the strings are in the content string, and only then - # load the terms. - - # Multiword terms have zws between all tokens. - reclist = _get_multiword_terms(language) - # dt.step(f"mwords, loaded {len(reclist)} records") - - # Performance hit 1 - zws = "\u200B" # zero-width space - content = zws + zws.join(text_lcs) + zws - mword_terms = [p[1] for p in reclist if f"{zws}{p[1]}{zws}" in content] - # dt.step("mword terms") + # Code for getting raw data: + # param_keys = [f"w{i}" for i, _ in enumerate(text_lcs)] + # keys_placeholders = ','.join([f":{k}" for k in param_keys]) + # param_dict = dict(zip(param_keys, text_lcs)) + # param_dict["langid"] = language.id + # sql = sqltext(f"""SELECT WoID, WoTextLC FROM words + # WHERE WoLgID=:langid and WoTextLC in ({keys_placeholders})""") + # sql = sql.bindparams(language.id, *text_lcs) + # results = db.session.execute(sql, param_dict).fetchall() text_lcs.extend(mword_terms) - - # Some term entity relationship objects (tags, parents) could be - # eagerly loaded using ".options(joinedload(Term.term_tags), - # joinedload(Term.parents))", but any gains in subsequent usage - # are offset by the slower query! 
- # Performance hit 2 tok_strings = list(set(text_lcs)) terms_matching_tokens_qry = db.session.query(Term).filter( Term.text_lc.in_(tok_strings), Term.language == language ) - # dt.step("query prep") - all_terms = terms_matching_tokens_qry.all() # dt.step("exec query") @@ -160,23 +152,40 @@ def _find_all_terms_in_tokens(tokens, language): ## Getting paragraphs ############################## -def get_paragraphs(s, language): +def get_textitems(s, language, multiword_term_indexer=None): """ - Get array of arrays of TextItems for the given string s. - """ - # dt = DebugTimer("get_paragraphs", display=False) + Get array of TextItems for the string s. + The multiword_term_indexer is a big performance boost, but takes + time to initialize. + """ # Hacky reset of state of ParsedToken state. # _Shouldn't_ be needed but doesn't hurt, even if it's lame. ParsedToken.reset_counters() cleaned = re.sub(r" +", " ", s) tokens = language.get_parsed_tokens(cleaned) - # dt.step("get_parsed_tokens") + terms = _find_all_terms_in_tokens(tokens, language, multiword_term_indexer) + textitems = calc_get_textitems(tokens, terms, language, multiword_term_indexer) + return textitems + + +def get_multiword_indexer(language): + "Return indexer loaded with all multiword terms." + mw = MultiwordTermIndexer() + for r in _get_multiword_terms(language): + mw.add(r[1]) + return mw - terms = _find_all_terms_in_tokens(tokens, language) - textitems = get_textitems(tokens, terms, language) +def get_paragraphs(s, language): + """ + Get array of arrays of TextItems for the given string s. + + This doesn't use an indexer, as it should only be used + for a single page of text! + """ + textitems = get_textitems(s, language) def _split_textitems_by_paragraph(textitems): "Split by ¶" diff --git a/tests/unit/book/test_stats.py b/tests/unit/book/test_stats.py index dcc118843..92604a7e2 100644 --- a/tests/unit/book/test_stats.py +++ b/tests/unit/book/test_stats.py @@ -6,7 +6,12 @@ from lute.db import db from lute.term.model import Term, Repository -from lute.book.stats import get_status_distribution, refresh_stats, mark_stale +from lute.book.stats import ( + calc_status_distribution, + refresh_stats, + mark_stale, + get_stats, +) from tests.utils import make_text, make_book from tests.dbasserts import assert_record_count_equals, assert_sql_result @@ -35,7 +40,7 @@ def scenario(language, fulltext, terms_and_statuses, expected): for ts in terms_and_statuses: add_term(language, ts[0], ts[1]) - stats = get_status_distribution(b) + stats = calc_status_distribution(b) assert stats == expected @@ -141,6 +146,34 @@ def test_stats_smoke_test(_test_book, spanish): ) +def test_get_stats_calculates_and_caches_stats(_test_book, spanish): + "Calculating stats is expensive, so store them on get." 
+ add_terms(spanish, ["gato", "TENGO"]) + assert_record_count_equals("bookstats", 0, "cache not loaded") + assert_stats([], "No stats cached at start.") + + stats = get_stats(_test_book) + assert stats.BkID == _test_book.id + assert stats.distinctterms == 4 + assert stats.distinctunknowns == 2 + assert stats.unknownpercent == 50 + assert ( + stats.status_distribution + == '{"0": 2, "1": 2, "2": 0, "3": 0, "4": 0, "5": 0, "98": 0, "99": 0}' + ) + + assert_record_count_equals("bookstats", 1, "cache loaded") + assert_stats( + ["4; 2; 50; {'0': 2, '1': 2, '2': 0, '3': 0, '4': 0, '5': 0, '98': 0, '99': 0}"] + ) + stats = get_stats(_test_book) + assert stats.BkID == _test_book.id + assert ( + stats.status_distribution + == '{"0": 2, "1": 2, "2": 0, "3": 0, "4": 0, "5": 0, "98": 0, "99": 0}' + ) + + def test_stats_calculates_rendered_text(_test_book, spanish): "Multiword term counted as one term." add_terms(spanish, ["tengo un"]) From e81b251c6c94094fbb23f5141d7423ae0c82bd54 Mon Sep 17 00:00:00 2001 From: Jeff Zohrab Date: Fri, 4 Oct 2024 16:07:43 -0700 Subject: [PATCH 3/5] Ajax in book stats graphs. --- lute/app_factory.py | 14 --------- lute/book/routes.py | 15 ++++++++++ lute/static/css/styles.css | 5 ++++ lute/templates/book/tablelisting.html | 41 +++++++++++++++++---------- 4 files changed, 46 insertions(+), 29 deletions(-) diff --git a/lute/app_factory.py b/lute/app_factory.py index 3bf1b143d..b6806aaff 100644 --- a/lute/app_factory.py +++ b/lute/app_factory.py @@ -139,7 +139,6 @@ def index(): if is_production and have_books and should_run_auto_backup: return redirect("/backup/backup", 302) - refresh_stats() warning_msg = backupservice.backup_warning(bkp_settings) backup_show_warning = ( bkp_settings.backup_warn @@ -147,11 +146,6 @@ def index(): and warning_msg != "" ) - # Disabling caching on this page so that book stats - # are recalculated, even if the user hits the browser - # "back" button after updating some terms. - # ref https://stackoverflow.com/questions/28627324/ - # disable-cache-on-a-specific-page-using-flask response = make_response( render_template( "index.html", @@ -164,26 +158,18 @@ def index(): language_choices=language_choices, current_language_id=current_language_id, is_production_data=is_production, - # Backup stats backup_show_warning=backup_show_warning, backup_warning_msg=warning_msg, ) ) - cc = "no-cache, no-store, must-revalidate, public, max-age=0" - response.headers["Cache-Control"] = cc - response.headers["Pragma"] = "no-cache" - response.headers["Expires"] = "0" return response @app.route("/refresh_all_stats") def refresh_all_stats(): books_to_update = db.session.query(Book).filter(Book.archived == 0).all() - for book in books_to_update: mark_stale(book) - refresh_stats() - return redirect("/", 302) @app.route("/wipe_database") diff --git a/lute/book/routes.py b/lute/book/routes.py index c1506ca7c..8271bbfe8 100644 --- a/lute/book/routes.py +++ b/lute/book/routes.py @@ -15,6 +15,7 @@ from lute.book import service from lute.book.datatables import get_data_tables_list from lute.book.forms import NewBookForm, EditBookForm +from lute.book.stats import get_stats import lute.utils.formutils from lute.db import db @@ -188,3 +189,17 @@ def delete(bookid): db.session.delete(b) db.session.commit() return redirect("/", 302) + + +@bp.route("/table_stats/", methods=["GET"]) +def table_stats(bookid): + "Get the stats, return ajax." 
+ b = DBBook.find(bookid) + stats = get_stats(b) + ret = { + "distinctterms": stats.distinctterms, + "distinctunknowns": stats.distinctunknowns, + "unknownpercent": stats.unknownpercent, + "status_distribution": stats.status_distribution, + } + return jsonify(ret) diff --git a/lute/static/css/styles.css b/lute/static/css/styles.css index 00faee502..343255f74 100644 --- a/lute/static/css/styles.css +++ b/lute/static/css/styles.css @@ -227,6 +227,7 @@ table#booktable:not(:has(a.completed_book)) a.book-title:before { .refreshed { background-image: url("../icn/waiting2.gif"); + background-repeat: no-repeat; } .book-action-dropdown { @@ -2034,6 +2035,10 @@ input[type="checkbox"][disabled] + label { padding-right: 1.5rem; } +.book-stats-ajax-cell { + font-style: italic; +} + .status-bar-container-empty { border-color: #e6e6e6; background-color: #fff; diff --git a/lute/templates/book/tablelisting.html b/lute/templates/book/tablelisting.html index 3262798fd..71e95d2f1 100644 --- a/lute/templates/book/tablelisting.html +++ b/lute/templates/book/tablelisting.html @@ -93,10 +93,10 @@ { name: "LgName", width: "10%", data: "LgName" }, { name: "TagList", width: "10%", data: "TagList" }, { name: "WordCount", width: "10%", data: "WordCount" }, - { name: "UnknownPercent", render: render_book_stats_graph }, + { name: "UnknownPercent", "searchable": false, render: render_book_stats_graph_placeholder }, { width: "8%", "searchable": false, "orderable": false, render: render_book_actions }, ], - + createdRow: function(row, data, dataIndex) { ajax_in_book_stats(row, data, dataIndex); }, ajax: { url: "/book/datatables/{{ status or 'active' }}", // Additional filters. func calls are required to get the @@ -217,21 +217,32 @@ return `${row['BkTitle']}${pgfraction}`; }; + /* Replaced by the status graph after the ajax call kicked off by createdRow. */ + let render_book_stats_graph_placeholder = function(data, type, row, meta) { + return ``; + }; - let render_book_stats_graph = function(data, type, row, meta) { - const empty_stats = `
 
`; - let statuscounts = row['StatusDistribution']; - if ((statuscounts ?? '')== '') { - return empty_stats; - } - try { - statuscounts = JSON.parse(statuscounts); - } - catch(err) { - console.log(`Invalid json: ${statuscounts}`); - return empty_stats; - } + /* Ajax called from createdRow datatables hook. */ + let ajax_in_book_stats = function(row, data, dataIndex) { + var cell = $(row).find('.book-stats-ajax-cell'); + $.ajax({ + url: '/book/table_stats/' + data['BkID'], + method: 'GET', + success: function(response) { + cell.removeClass("refreshed"); + const result = JSON.parse(response.status_distribution); + const graph = render_stats_graph(result); + cell.html(graph); + }, + error: function() { + cell.text('Error loading data'); + cell.removeClass("refreshed"); + } + }); + }; + /* Generate stats graph
from statuscounts JSON. */ + let render_stats_graph = function(statuscounts) { statuscounts["99"] = statuscounts["98"] + statuscounts["99"]; delete statuscounts['98']; const totalcount = Object.values(statuscounts).reduce((acc, val) => acc + val, 0); From 09b2038224331ebdd06f97d1b4479ffbef0becc3 Mon Sep 17 00:00:00 2001 From: Jeff Zohrab Date: Fri, 4 Oct 2024 16:07:58 -0700 Subject: [PATCH 4/5] Increase number of pages for book stats. --- lute/settings/routes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lute/settings/routes.py b/lute/settings/routes.py index bbf3800d7..4f32d34c9 100644 --- a/lute/settings/routes.py +++ b/lute/settings/routes.py @@ -50,10 +50,8 @@ class UserSettingsForm(FlaskForm): stop_audio_on_term_form_open = BooleanField("Stop audio on term form open") stats_calc_sample_size = IntegerField( "Book stats page sample size", - validators=[InputRequired(), NumberRange(min=1, max=200)], - render_kw={ - "title": "Number of pages to use for book stats calculation. Max 200 for performance." - }, + validators=[InputRequired(), NumberRange(min=1, max=500)], + render_kw={"title": "Number of pages to use for book stats calculation."}, ) mecab_path = StringField("MECAB_PATH environment variable") From df9d40bc8497294133ba6057923563a8a316402e Mon Sep 17 00:00:00 2001 From: Jeff Zohrab Date: Fri, 4 Oct 2024 16:54:32 -0700 Subject: [PATCH 5/5] Fix lint. --- lute/book/stats.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lute/book/stats.py b/lute/book/stats.py index 27b620524..22ea68f7c 100644 --- a/lute/book/stats.py +++ b/lute/book/stats.py @@ -26,11 +26,11 @@ def calc_status_distribution(book): Does a full render of a small number of pages to calculate the distribution. """ - txindex = 0 # DebugTimer.clear_total_summary() # dt = DebugTimer("get_status_distribution", display=False) + txindex = 0 if (book.current_tx_id or 0) != 0: for t in book.texts: if t.id == book.current_tx_id: @@ -42,19 +42,17 @@ def calc_status_distribution(book): # Getting the individual paragraphs per page, and then combining, # is much faster than combining all pages into one giant page. - lang = book.language - mw = get_multiword_indexer(lang) + mw = get_multiword_indexer(book.language) textitems = [] for tx in texts: - add_tis = [ti for ti in get_textitems(tx.text, lang, mw) if ti.is_word] - textitems.extend(add_tis) + textitems.extend(get_textitems(tx.text, book.language, mw)) # # Old slower code: # text_sample = "\n".join([t.text for t in texts]) # paras = get_paragraphs(text_sample, book.language) ... etc. # dt.step("get_paragraphs") + textitems = [ti for ti in textitems if ti.is_word] statterms = {0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 98: [], 99: []} - for ti in textitems: statterms[ti.wo_status or 0].append(ti.text_lc)
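Note (not part of any patch above): a minimal sketch of how the pieces introduced in patches 1 and 2 are meant to be combined when rendering many pages. It uses only APIs that appear in the diffs (get_multiword_indexer, get_textitems, and the TextItem fields already used in lute/book/stats.py); the count_unknown_terms helper itself is illustrative and is not part of this changeset.

    from lute.read.render.service import get_multiword_indexer, get_textitems

    def count_unknown_terms(book):
        "Illustrative helper: count distinct unknown (status 0) terms across a book's pages."
        lang = book.language
        # Build the Aho-Corasick keyword tree over the language's multiword
        # terms once; this is the expensive step, so the same indexer is
        # reused for every page.
        mw = get_multiword_indexer(lang)
        unknowns = set()
        for tx in book.texts:
            for ti in get_textitems(tx.text, lang, mw):
                if ti.is_word and (ti.wo_status or 0) == 0:
                    unknowns.add(ti.text_lc)
        return len(unknowns)

This mirrors calc_status_distribution in lute/book/stats.py: the indexer is built once per language and then passed to get_textitems for each page, which is what makes sampling more pages for book stats cheap enough (see patch 4, which raises the sample-size limit).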