Skip to content

Commit 4f7e2bd

Browse files
committed
cluster_args
1 parent 736b1f4 commit 4f7e2bd

File tree

10 files changed

+272
-22
lines changed

10 files changed

+272
-22
lines changed

app/api/server.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,13 @@ def add_vec_features(
236236
articles_dict: dict[DBName, str] = {}
237237

238238
def init_vec_db() -> None:
239+
"""
240+
Asynchronously initializes the vector databases. This can be really
241+
slow and it could happen that qdrant is not available for a while.
242+
If any call times out, this function will keep retrying to access the
243+
databases. Once everything is connected properly, the process queue
244+
is started.
245+
"""
239246
time.sleep(60.0) # NOTE: give qdrant plenty of time...
240247
try:
241248
tstart = time.monotonic()
@@ -288,11 +295,38 @@ def init_vec_db() -> None:
288295
th.start()
289296

290297
def parse_vdb(vdb_str: str) -> DBName:
298+
"""
299+
Converts a string into the external database name type.
300+
301+
Args:
302+
vdb_str (str): The string.
303+
304+
Raises:
305+
ValueError: If the string is not a valid external vector database
306+
name.
307+
308+
Returns:
309+
DBName: The external vector database name.
310+
"""
291311
if vdb_str not in DBS:
292312
raise ValueError(f"db ({vdb_str}) must be one of {DBS}")
293313
return cast(DBName, vdb_str)
294314

295315
def get_articles(vdb_str: str) -> str:
316+
"""
317+
Converts an external vector database name into an internal vector
318+
database name.
319+
320+
Args:
321+
vdb_str (str): The external database name.
322+
323+
Raises:
324+
ValueError: If the string is not a valid external vector database
325+
name or the databases have not been loaded yet.
326+
327+
Returns:
328+
str: The internal name for the given vector database.
329+
"""
296330
vdb = parse_vdb(vdb_str)
297331
res = articles_dict.get(vdb)
298332
if res:
@@ -304,12 +338,41 @@ def get_articles(vdb_str: str) -> str:
304338
raise ValueError("vector database is not ready yet!")
305339

306340
def get_articles_dict() -> dict[DBName, str]:
341+
"""
342+
Retrieve all loaded vector databases.
343+
344+
Returns:
345+
dict[DBName, str]: The external name mapped to the internal name.
346+
"""
307347
return dict(articles_dict)
308348

309349
@server.json_post(f"{prefix}/stats")
310350
@server.middleware(verify_readonly)
311351
@server.middleware(maybe_session)
312352
def _post_stats(_req: QSRH, rargs: ReqArgs) -> StatEmbed:
353+
"""
354+
The `/api/stats` endpoint provides document counts for semantic search
355+
queries. If the session cookie is not provided or invalid only public
356+
documents are considered for the stats.
357+
358+
@readonly
359+
@cookie (optional)
360+
361+
Args:
362+
_req (QSRH): The request.
363+
rargs (ReqArgs): The arguments.
364+
POST
365+
"fields": A set of field types expected to be returned.
366+
"filters": A dictionary of field types to lists of filter
367+
values. The date field, if given, expects a list of
368+
exactly two values, the start and end date
369+
(both inclusive). If the session cookie is missing or
370+
invalid the "status" filter gets overwritten to
371+
include "public" documents only.
372+
"vecdb": The vector database.
373+
Returns:
374+
StatEmbed: Vector database document counts.
375+
"""
313376
session: SessionInfo | None = rargs["meta"].get("session")
314377
args = rargs["post"]
315378
fields = set(args["fields"])
@@ -961,7 +1024,12 @@ def _post_tags_create(_req: QSRH, rargs: ReqArgs) -> AddQueue:
9611024
name: str | None = args.get("name")
9621025
bases: list[str] = list(args["bases"])
9631026
is_updating = to_bool(args.get("is_updating", True))
964-
tag_processor(name=name, bases=bases, is_updating=is_updating)
1027+
cluster_args = args.get("cluster_args", {})
1028+
tag_processor(
1029+
name=name,
1030+
bases=bases,
1031+
is_updating=is_updating,
1032+
cluster_args=cluster_args)
9651033
return {
9661034
"enqueued": True,
9671035
}

app/misc/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@
1313
#
1414
# You should have received a copy of the GNU General Public License
1515
# along with this program. If not, see <https://www.gnu.org/licenses/>.
16+
"""Miscellaneous helper functions."""

app/misc/context.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,34 @@
1313
#
1414
# You should have received a copy of the GNU General Public License
1515
# along with this program. If not, see <https://www.gnu.org/licenses/>.
16+
"""Helper function to determine the context of a given hit."""
1617
import re
1718

1819

1920
CONTEXT_SIZE = 20
21+
"""The desired context size in characters for both directions."""
2022
CONTEXT_MAX_EXPAND = 10
23+
"""The maximum expansion over the desired context size."""
2124
CONTEXT_END = re.compile(r"\b")
25+
"""Regex to find a suitable end of a context."""
2226
CONTEXT_START = re.compile(r"\b")
27+
"""Regex to find a suitable start of a context."""
2328
ELLIPSIS = "…"
29+
"""The ellipsis character."""
2430

2531

2632
def get_context(text: str, start: int, stop: int) -> str:
33+
"""
34+
Gets the context of the given hit.
35+
36+
Args:
37+
text (str): The full text.
38+
start (int): The hit start index.
39+
stop (int): The hit end index.
40+
41+
Returns:
42+
str: The hit with surrounding context.
43+
"""
2744
orig_start = start
2845
orig_stop = stop
2946
start = max(start - CONTEXT_SIZE, 0)

app/misc/env.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#
1414
# You should have received a copy of the GNU General Public License
1515
# along with this program. If not, see <https://www.gnu.org/licenses/>.
16+
"""Handling environment variables."""
1617
import os
1718
from typing import Literal
1819

@@ -25,6 +26,7 @@
2526
"SMIND_CFG",
2627
"UI_PATH",
2728
]
29+
"""Environment variables representing a file path or folder."""
2830
EnvStr = Literal[
2931
"APP_SECRET",
3032
"BLOGS_DB_DIALECT",
@@ -50,17 +52,20 @@
5052
"TANUKI",
5153
"WRITE_TOKEN",
5254
]
55+
"""Environment variables representing a string."""
5356
EnvInt = Literal[
5457
"BLOGS_DB_PORT",
5558
"LOGIN_DB_PORT",
5659
"PORT",
5760
"QDRANT_GRPC_PORT",
5861
"QDRANT_REST_PORT",
5962
]
63+
"""Environment variables representing an integer."""
6064
EnvBool = Literal[
6165
"NO_QDRANT",
6266
"HAS_LLAMA",
6367
]
68+
"""Environment variables representing a boolean value (true, false, 0, 1)."""
6469

6570

6671
def _envload(key: str, default: str | None) -> str:
@@ -73,16 +78,60 @@ def _envload(key: str, default: str | None) -> str:
7378

7479

7580
def envload_str(key: EnvStr, *, default: str | None = None) -> str:
81+
"""
82+
Loads a string environment variable.
83+
84+
Args:
85+
key (EnvStr): The variable name.
86+
default (str | None, optional): The default value. If None, the
87+
environment variable is mandatory. Defaults to None.
88+
89+
Returns:
90+
str: The value.
91+
"""
7692
return _envload(key, default)
7793

7894

7995
def envload_path(key: EnvPath, *, default: str | None = None) -> str:
96+
"""
97+
Loads a path or folder environment variable.
98+
99+
Args:
100+
key (EnvPath): The variable name.
101+
default (str | None, optional): The default value. If None, the
102+
environment variable is mandatory. Defaults to None.
103+
104+
Returns:
105+
str: The value.
106+
"""
80107
return _envload(key, default)
81108

82109

83110
def envload_int(key: EnvInt, *, default: int | None = None) -> int:
111+
"""
112+
Loads an integer environment variable.
113+
114+
Args:
115+
key (EnvInt): The variable name.
116+
default (int | None, optional): The default value. If None, the
117+
environment variable is mandatory. Defaults to None.
118+
119+
Returns:
120+
int: The value.
121+
"""
84122
return int(_envload(key, f"{default}"))
85123

86124

87125
def envload_bool(key: EnvBool, *, default: bool | None = None) -> bool:
126+
"""
127+
Loads a boolean environment variable (0, 1, true, false).
128+
129+
Args:
130+
key (EnvBool): The variable name.
131+
default (bool | None, optional): The default value. If None, the
132+
environment variable is mandatory. Defaults to None.
133+
134+
Returns:
135+
bool: The value.
136+
"""
88137
return to_bool(_envload(key, f"{default}"))

app/misc/io.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#
1414
# You should have received a copy of the GNU General Public License
1515
# along with this program. If not, see <https://www.gnu.org/licenses/>.
16+
"""I/O helper functions that handle a slow disk (or network disk) gracefully.
17+
"""
1618
import contextlib
1719
import errno
1820
import io
@@ -26,11 +28,23 @@
2628

2729

2830
MAIN_LOCK = threading.RLock()
31+
"""Lock for coordinating the wait on start when the (network) disk is not
32+
ready yet. Network disks can take a bit to get ready after a container is
33+
started."""
2934
STALE_FILE_RETRIES: list[float] = [0.1, 0.2, 0.5, 0.8, 1, 1.2, 1.5, 2, 3, 5]
35+
"""Wait times for retrying reads on stale files."""
3036
TMP_POSTFIX = ".~tmp"
37+
"""Postfix for temporary files."""
3138

3239

3340
def when_ready(fun: Callable[[], None]) -> None:
41+
"""
42+
Executes an I/O operation, retrying if the disk is not ready. After 120
43+
retries (~2min) the function gives up and lets the error go through.
44+
45+
Args:
46+
fun (Callable[[], None]): The I/O operation.
47+
"""
3448
with MAIN_LOCK:
3549
counter = 0
3650
while True:
@@ -46,6 +60,13 @@ def when_ready(fun: Callable[[], None]) -> None:
4660

4761

4862
def fastrename(src: str, dst: str) -> None:
63+
"""
64+
Moves a file or folder. Source and destination cannot be the same.
65+
66+
Args:
67+
src (str): The source file or folder.
68+
dst (str): The destination file or folder.
69+
"""
4970
src = os.path.abspath(src)
5071
dst = os.path.abspath(dst)
5172
if src == dst:
@@ -71,10 +92,26 @@ def fastrename(src: str, dst: str) -> None:
7192

7293

7394
def copy_file(from_file: str, to_file: str) -> None:
95+
"""
96+
Copies a file to a new destination.
97+
98+
Args:
99+
from_file (str): The source file.
100+
to_file (str): The destination file.
101+
"""
74102
shutil.copy(from_file, to_file)
75103

76104

77105
def normalize_folder(folder: str) -> str:
106+
"""
107+
Makes the path absolute and ensures that the folder exists.
108+
109+
Args:
110+
folder (str): The folder.
111+
112+
Returns:
113+
str: The absolute path.
114+
"""
78115
res = os.path.abspath(folder)
79116
when_ready(lambda: os.makedirs(res, mode=0o777, exist_ok=True))
80117
if not os.path.isdir(res):
@@ -83,16 +120,44 @@ def normalize_folder(folder: str) -> str:
83120

84121

85122
def normalize_file(fname: str) -> str:
123+
"""
124+
Makes the path absolute and ensures that the parent folder exists.
125+
126+
Args:
127+
fname (str): The file.
128+
129+
Returns:
130+
str: The absolute path.
131+
"""
86132
res = os.path.abspath(fname)
87133
normalize_folder(os.path.dirname(res))
88134
return res
89135

90136

91137
def get_mode(base: str, text: bool) -> str:
138+
"""
139+
Creates a mode string for the `open` function.
140+
141+
Args:
142+
base (str): The base mode string.
143+
text (bool): Whether it is a text file.
144+
145+
Returns:
146+
str: The mode string.
147+
"""
92148
return f"{base}{'' if text else 'b'}"
93149

94150

95151
def is_empty_file(fin: IO[Any]) -> bool:
152+
"""
153+
Checks whether the given file is empty.
154+
155+
Args:
156+
fin (IO[Any]): The file handle.
157+
158+
Returns:
159+
bool: True, if the file is empty.
160+
"""
96161
pos = fin.seek(0, io.SEEK_CUR)
97162
size = fin.seek(0, io.SEEK_END) - pos
98163
fin.seek(pos, io.SEEK_SET)
@@ -110,6 +175,15 @@ def ensure_folder(folder: None) -> None:
110175

111176

112177
def ensure_folder(folder: str | None) -> str | None:
178+
"""
179+
Ensures that the given folder exists.
180+
181+
Args:
182+
folder (str | None): The folder name or None.
183+
184+
Returns:
185+
str | None: The folder name or None.
186+
"""
113187
if folder is not None and not os.path.exists(folder):
114188
a_folder: str = folder
115189
when_ready(lambda: os.makedirs(a_folder, mode=0o777, exist_ok=True))

0 commit comments

Comments
 (0)