add documentation (#22)

- [x] add inline documentation - [x] inline documentation for api endpoints - [x] add curl scenarios - [x] add info about deploying in readme
UNDP-Accelerator-Labs · Aug 16, 2024 · b74a6c4 · b74a6c4
1 parent b4b4bf6
commit b74a6c4
Show file tree

Hide file tree

Showing 81 changed files with 5,267 additions and 77 deletions.
diff --git a/.gitignore b/.gitignore
@@ -29,6 +29,7 @@ buildtmp
 
 !deploy/*.json
 !deploy/graphs/*.json
+!sample.config.json
 deploy/default.env
 /userdata/
 

diff --git a/README.md b/README.md
@@ -5,6 +5,10 @@ This repo contains a python based API server that provides various NLP APIs.
 
 A public facing UI can be found [here](https://nlpapi.sdg-innovation-commons.org/search/).
 
+## API usage
+
+You can see some API scenarios [here](/scenarios.md).
+
 ## Setup Python
 
 In order to setup python install `python >= 3.11` and `conda`.
@@ -33,6 +37,55 @@ You can also call individual lints via make. See `make help`.
 to fill in the correct values. Then run `make run-local` again. Also, make
 sure to add the `gguf` file into the `models` folder (to run the LLM).
 
+You can force a vector database to be loaded by setting `NO_QDRANT=false` in the
+env file. This, however, requires you to provide credentials for the qdrant
+database. It is not possible to access the qdrant database on Azure since it
+is private to the docker compose and doesn't expose any endpoints. However, you
+can create a local database by either running a docker image with it or by
+providing a `file://path/to/database` as qdrant URL.
+
+You can deactivate the LLM by setting `HAS_LLAMA=false` in the env file.
+
+## Diagnosing qdrant
+
+The qdrant UI is exposed on the server via the
+[`/qdrant/dashboard`](https://nlpapi.sdg-innovation-commons.org/qdrant/dashboard)
+endpoint.
+You will have to provide the internal qdrant api token for accessing the
+dashboard.
+
+## Running a docker compose locally
+
+Use the `make build-dev` command to create a local docker compose file. This
+requires a config file in the root folder: `docker.config.json`. After building
+the local docker compose you can run it via `make compose`. Both commands are
+unified as `make run-docker-api`. `sample.config.json` provides a sample config
+file. It does not specify a local vector database by default. You can replace
+the `null` with a valid local configuration, though.
+
+## Publishing the local copy
+
+Make sure your workspace is clean and all your commits are pushed to GitHub.
+Have your docker daemon running.
+Run `make azlogin` to log in to the azure account. Then, run
+`make publish-local`. This will push the local images and end with the docker
+compose file printed to stdout. Copy the docker compose content to the
+deployment configuration tab on azure and save. This will update the app.
+
+## Publishing a main version
+
+When on the main branch, call `make publish` to create the version tag. The
+CI will build and push all docker images. Once that is done, from the same
+commit, run `make build` locally to get the docker compose file. After that
+you can also retrieve the current docker compose file via `make show-compose`.
+Take the docker compose file output from either command and copy it to the
+deployment configuration tab on azure and save. This will update the app.
+
+## More below
+
+Some of the following information might be redundant and / or slightly
+outdated.
+
 ## Running the server
 
 In order to get the language API to work, create the tables by running:

diff --git a/app/__init__.py b/app/__init__.py
@@ -13,3 +13,4 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""The NLP API app."""
diff --git a/app/__main__.py b/app/__main__.py
@@ -13,6 +13,7 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""Start the NLP API app server."""
 import argparse
 import os
 import traceback
@@ -30,6 +31,12 @@
 
 
 def parse_args() -> argparse.Namespace:
+    """
+    Parses command line arguments.
+
+    Returns:
+        argparse.Namespace: The arguments.
+    """
     parser = argparse.ArgumentParser(
         prog=f"python -m {python_module()}",
         description="Run the API server")
@@ -55,6 +62,7 @@ def parse_args() -> argparse.Namespace:
 
 
 def run() -> None:
+    """Start the app server."""
     args = parse_args()
     env_file: str | None = args.env
     if env_file:

diff --git a/app/api/__init__.py b/app/api/__init__.py
@@ -13,3 +13,4 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""Specifies the api endpoints, api types, and the module system."""
diff --git a/app/api/mod.py b/app/api/mod.py
@@ -13,19 +13,43 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""Modules can be used through a unified api endpoint to compute multiple
+metrics for a given input string."""
 import uuid
 from collections.abc import Mapping
 from typing import Any
 
 
 class Module:
+    """
+    A module defines a metric for input strings. It can be accessed via the
+    `/api/extract` endpoint when it is registered in `server.py`.
+    """
     @staticmethod
     def name() -> str:
+        """
+        The name of the module.
+
+        Returns:
+            str: The name of the module. This is a parameter to the
+                `/api/extract` endpoint.
+        """
         raise NotImplementedError()
 
     def execute(
             self,
             input_str: str,
             user: uuid.UUID,
             args: dict[str, Any]) -> Mapping[str, Any]:
+        """
+        Executes the module and returns a dictionary with its results.
+
+        Args:
+            input_str (str): The input string.
+            user (uuid.UUID): The user that initiated the request.
+            args (dict[str, Any]): Arguments to the module.
+
+        Returns:
+            Mapping[str, Any]: The results.
+        """
         raise NotImplementedError()
diff --git a/app/api/mods/__init__.py b/app/api/mods/__init__.py
@@ -13,3 +13,4 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""Implements the module system for the language and location apis."""
diff --git a/app/api/mods/lang.py b/app/api/mods/lang.py
@@ -13,6 +13,7 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""The language module."""
 import uuid
 from collections.abc import Mapping
 from typing import Any
@@ -23,6 +24,7 @@
 
 
 class LanguageModule(Module):
+    """The language module."""
     def __init__(self, db: DBConnector) -> None:
         self._db = db
 

diff --git a/app/api/mods/loc.py b/app/api/mods/loc.py
@@ -13,6 +13,7 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""The location module."""
 import uuid
 from collections.abc import Mapping
 from typing import Any
@@ -29,6 +30,7 @@
 
 
 class LocationModule(Module):
+    """The location module."""
     def __init__(
             self,
             db: DBConnector,

diff --git a/app/api/response_types.py b/app/api/response_types.py
@@ -13,6 +13,7 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""The types for the api endpoint results."""
 from typing import TypedDict
 
 from app.system.autotag.autotag import TagClusterEntry
@@ -22,16 +23,12 @@
 from app.system.workqueues.queue import ProcessError, ProcessQueueStats
 
 
-SourceResponse = TypedDict('SourceResponse', {
-    "source": str,
-})
-SourceListResponse = TypedDict('SourceListResponse', {
-    "sources": list[str],
-})
 UserResponse = TypedDict('UserResponse', {
     "uuid": str | None,
     "name": str | None,
 })
+"""Provides information about the currently logged in user: the uuid and
+display name."""
 VersionResponse = TypedDict('VersionResponse', {
     "app_name": str,
     "app_commit": str,
@@ -45,86 +42,117 @@
     "deepdives": list[str],
     "error": list[str] | None,
 })
+"""Provides information about the currently running server."""
+HeartbeatResponse = TypedDict('HeartbeatResponse', {
+    "okay": bool,
+})
+"""Heartbeat response."""
 StatsResponse = TypedDict('StatsResponse', {
     "vecdbs": list[VecDBStat],
     "queues": list[QueueStat],
     "process_queue": ProcessQueueStats,
 })
+"""Statistics about the current state of the app."""
 URLInspectResponse = TypedDict('URLInspectResponse', {
     "url": str,
     "iso3": str | None,
 })
+"""Information about a url."""
 DateResponse = TypedDict('DateResponse', {
     "date": str | None,
 })
+"""The date or `None` if no date could be found."""
 Snippy = TypedDict('Snippy', {
     "text": str,
     "offset": int,
 })
+"""A snippet. It contains the text and the offset in the fulltext."""
 SnippyResponse = TypedDict('SnippyResponse', {
     "count": int,
     "snippets": list[Snippy],
 })
+"""The number of snippets and the actual snippets in a list."""
 BuildIndexResponse = TypedDict('BuildIndexResponse', {
     "new_index_count": int,
 })
+"""Indicates the number of newly created indices."""
 CollectionResponse = TypedDict('CollectionResponse', {
     "collection_id": int,
 })
+"""The collection id."""
 CollectionStats = TypedDict('CollectionStats', {
     "segments": list[SegmentStats],
 })
+"""Segment statistics. Provides information about the progress of each
+snippet."""
 CollectionJSON = TypedDict('CollectionJSON', {
     "id": int,
     "user": str,
     "name": str,
     "deep_dive_name": str,
     "is_public": bool,
 })
+"""Information about a collection."""
 CollectionListResponse = TypedDict('CollectionListResponse', {
     "collections": list[CollectionJSON],
 })
+"""All collections visible to the current user."""
 CollectionOptionsResponse = TypedDict('CollectionOptionsResponse', {
     "success": bool,
 })
+"""Whether the operation was successful."""
 DocumentResponse = TypedDict('DocumentResponse', {
     "document_ids": list[int],
 })
+"""List of document ids."""
 DocumentListResponse = TypedDict('DocumentListResponse', {
     "documents": list[DocumentObj],
     "is_readonly": bool,
 })
+"""All documents of a collection and information about whether the current
+user is allowed to modify the collection."""
 TagListResponse = TypedDict('TagListResponse', {
     "tags": dict[str, list[str]],
     "tag_group": int,
 })
+"""Tags (clusters) of a given set of main ids. The tag group is returned as
+well."""
 TagClustersResponse = TypedDict('TagClustersResponse', {
     "clusters": list[TagClusterEntry],
     "tag_group": int,
 })
+"""Clusters of a given tag group. The tag group is returned as well."""
 TagDocsResponse = TypedDict('TagDocsResponse', {
     "main_ids": list[str],
     "tag_group": int,
     "cluster_id": int,
 })
+"""All documents (main ids) with the given tag (cluster). The tag group and
+cluster id are returned as well."""
 FulltextResponse = TypedDict('FulltextResponse', {
     "content": str | None,
     "error": str | None,
 })
+"""Either the fulltext of a document or the error."""
 TitleResponse = TypedDict('TitleResponse', {
     "url": str | None,
     "title": str | None,
     "error": str | None,
 })
+"""Either the url and title of a document or the error."""
 TitlesResponse = TypedDict('TitlesResponse', {
     "info": list[TitleResponse],
 })
+"""Info about multiple documents."""
 RequeueResponse = TypedDict('RequeueResponse', {
     "done": bool,
 })
+"""Whether the queue operation was successful."""
 AddQueue = TypedDict('AddQueue', {
     "enqueued": bool,
 })
+"""Whether the element is successfully added to the queue."""
 ErrorProcessQueue = TypedDict('ErrorProcessQueue', {
     "errors": list[ProcessError],
 })
+"""Errors in the processing queue."""