3
3
"""
4
4
5
5
import json
6
- from lute .read .render .service import get_paragraphs
6
+ from lute .read .render .service import get_multiword_indexer , get_textitems
7
7
from lute .db import db
8
8
from lute .models .book import Book
9
9
from lute .models .setting import UserSetting
@@ -19,51 +19,41 @@ def _last_n_pages(book, txindex, n):
19
19
return texts [- n :]
20
20
21
21
22
- def get_status_distribution (book ):
22
+ def calc_status_distribution (book ):
23
23
"""
24
- Return statuses and count of unique words per status.
24
+ Calculate statuses and count of unique words per status.
25
25
26
26
Does a full render of a small number of pages
27
27
to calculate the distribution.
28
28
"""
29
- txindex = 0
30
29
31
- # dt = DebugTimer("get_status_distribution", display=True)
30
+ # DebugTimer.clear_total_summary()
31
+ # dt = DebugTimer("get_status_distribution", display=False)
32
32
33
+ txindex = 0
33
34
if (book .current_tx_id or 0 ) != 0 :
34
35
for t in book .texts :
35
36
if t .id == book .current_tx_id :
36
37
break
37
38
txindex += 1
38
39
39
- # Use a sample of pages to speed up stats count.
40
40
sample_size = int (UserSetting .get_value ("stats_calc_sample_size" ) or 5 )
41
41
texts = _last_n_pages (book , txindex , sample_size )
42
42
43
43
# Getting the individual paragraphs per page, and then combining,
44
44
# is much faster than combining all pages into one giant page.
45
- paras = [get_paragraphs (t .text , book .language ) for t in texts ]
45
+ mw = get_multiword_indexer (book .language )
46
+ textitems = []
47
+ for tx in texts :
48
+ textitems .extend (get_textitems (tx .text , book .language , mw ))
46
49
# # Old slower code:
47
50
# text_sample = "\n".join([t.text for t in texts])
48
- # paras = get_paragraphs(text_sample, book.language)
49
-
51
+ # paras = get_paragraphs(text_sample, book.language) ... etc.
50
52
# dt.step("get_paragraphs")
51
- # DebugTimer.total_summary()
52
-
53
- def flatten_list (nested_list ):
54
- result = []
55
- for item in nested_list :
56
- if isinstance (item , list ):
57
- result .extend (flatten_list (item ))
58
- else :
59
- result .append (item )
60
- return result
61
-
62
- text_items = [ti for ti in flatten_list (paras ) if ti .is_word ]
63
53
54
+ textitems = [ti for ti in textitems if ti .is_word ]
64
55
statterms = {0 : [], 1 : [], 2 : [], 3 : [], 4 : [], 5 : [], 98 : [], 99 : []}
65
-
66
- for ti in text_items :
56
+ for ti in textitems :
67
57
statterms [ti .wo_status or 0 ].append (ti .text_lc )
68
58
69
59
stats = {}
@@ -72,6 +62,9 @@ def flatten_list(nested_list):
72
62
statterms [statusval ] = uniques
73
63
stats [statusval ] = len (uniques )
74
64
65
+ # dt.step("compiled")
66
+ # DebugTimer.total_summary()
67
+
75
68
return stats
76
69
77
70
@@ -83,8 +76,7 @@ class BookStats(db.Model):
83
76
"The stats table."
84
77
__tablename__ = "bookstats"
85
78
86
- id = db .Column (db .Integer , primary_key = True )
87
- BkID = db .Column (db .Integer )
79
+ BkID = db .Column (db .Integer , primary_key = True )
88
80
distinctterms = db .Column (db .Integer )
89
81
distinctunknowns = db .Column (db .Integer )
90
82
unknownpercent = db .Column (db .Integer )
@@ -100,7 +92,7 @@ def refresh_stats():
100
92
)
101
93
books = [b for b in books_to_update if b .is_supported ]
102
94
for book in books :
103
- stats = _get_stats (book )
95
+ stats = _calculate_stats (book )
104
96
_update_stats (book , stats )
105
97
106
98
@@ -111,31 +103,43 @@ def mark_stale(book):
111
103
db .session .commit ()
112
104
113
105
114
def get_stats(book):
    "Return cached stats for the book, computing and caching them on a miss."
    cached = db.session.query(BookStats).filter_by(BkID=book.id).first()
    if cached is not None:
        return cached
    # Cache miss: do the full calculation, persist it, then re-read the row.
    _update_stats(book, _calculate_stats(book))
    return db.session.query(BookStats).filter_by(BkID=book.id).first()
115
+
116
+
117
def _calculate_stats(book):
    "Calc stats for the book using the status distribution."
    distribution = calc_status_distribution(book)
    unknown_count = distribution[0]
    total_unique = sum(distribution.values())

    # Guard against an unparsed book (zero words overall).
    pct_unknown = round(100.0 * unknown_count / total_unique) if total_unique > 0 else 0

    return {
        "allunique": total_unique,
        "unknowns": unknown_count,
        "percent": pct_unknown,
        "distribution": json.dumps(distribution),
    }
129
133
130
134
131
135
def _update_stats(book, stats):
    "Update BookStats for the given book."
    # Upsert: reuse the existing row keyed by BkID if present, else create one.
    record = db.session.query(BookStats).filter_by(BkID=book.id).first()
    if record is None:
        record = BookStats(BkID=book.id)
    record.distinctterms = stats["allunique"]
    record.distinctunknowns = stats["unknowns"]
    record.unknownpercent = stats["percent"]
    record.status_distribution = stats["distribution"]
    db.session.add(record)
    db.session.commit()
0 commit comments