fix: DEV-2523: Support webhook data loading in NER ml backend example

KonstantinKorotaev · KonstantinKorotaev · commit 3a028e8b94ec · 2022-07-04T20:49:49.000+03:00
diff --git a/label_studio_ml/examples/ner/ner.py b/label_studio_ml/examples/ner/ner.py
@@ -25,9 +25,9 @@
 from transformers import AdamW, get_linear_schedule_with_warmup
 
 from label_studio_ml.model import LabelStudioMLBase
+from label_studio_ml.utils import get_annotated_dataset
 from utils import calc_slope
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -342,7 +342,7 @@ def __init__(self, **kwargs):
         self.to_name = self.info['to_name'][0]
         self.value = self.info['inputs'][0]['value']
 
-        if not self.train_output:
+        if not self.train_output or (not self.train_output.get('model_path')):
             self.labels = self.info['labels']
         else:
             self.load(self.train_output)
@@ -464,6 +464,13 @@ def fit(
         warmup_steps=0, save_steps=50, dump_dataset=True, cache_dir='~/.heartex/cache', train_logs=None,
         **kwargs
     ):
+        # check if training is from web hook
+        if kwargs.get('data'):
+            project_id = kwargs['data']['project']['id']
+            completions = get_annotated_dataset(project_id)
+        # assert that there are annotations
+        assert len(completions) > 0
+
         train_logs = train_logs or os.path.join(workdir, 'train_logs')
         os.makedirs(train_logs, exist_ok=True)
         logger.debug('Prepare models')
diff --git a/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py b/label_studio_ml/examples/simple_text_classifier/simple_text_classifier.py
@@ -40,7 +40,7 @@ def __init__(self, **kwargs):
         self.to_name = self.info['to_name'][0]
         self.value = self.info['inputs'][0]['value']
 
-        if not self.train_output:
+        if (not self.train_output) or (self.train_output and not self.train_output.get('model_file')):
             # If there is no trainings, define cold-started the simple TF-IDF text classifier
             self.reset_model()
             # This is an array of <Choice> labels
diff --git a/label_studio_ml/model.py b/label_studio_ml/model.py
@@ -29,13 +29,14 @@
 from rq.job import Job
 from colorama import Fore
 
-from label_studio_tools.core.utils.params import get_bool_env
+from label_studio_tools.core.utils.params import get_bool_env, get_env
 from label_studio_tools.core.label_config import parse_config
 from label_studio_tools.core.utils.io import get_local_path
 
 logger = logging.getLogger(__name__)
 
 LABEL_STUDIO_ML_BACKEND_V2_DEFAULT = False
+LABEL_STUDIO_STRICT_ERRORS = get_bool_env("LS_STRICT_ERRORS", False)  # parse as bool: a raw "false"/"0" string would be truthy
 
 @attr.s
 class ModelWrapper(object):
@@ -189,12 +190,12 @@ def _get_result_from_job_id(self, job_id):
         if not os.path.exists(job_dir):
             logger.warning(f"=> Warning: {job_id} dir doesn't exist. "
                            f"It seems that you don't have specified model dir.")
-            return None
+            return None if LABEL_STUDIO_STRICT_ERRORS else {}
         result_file = os.path.join(job_dir, self.JOB_RESULT)
         if not os.path.exists(result_file):
             logger.warning(f"=> Warning: {job_id} dir doesn't contain result file. "
                            f"It seems that previous training session ended with error.")
-            return None
+            return None if LABEL_STUDIO_STRICT_ERRORS else {}
         logger.debug(f'Read result from {result_file}')
         with open(result_file) as f:
             result = json.load(f)
diff --git a/label_studio_ml/utils.py b/label_studio_ml/utils.py
@@ -1,4 +1,6 @@
+import json
 import logging
+import requests
 
 from PIL import Image
 
@@ -48,3 +50,44 @@ def get_image_local_path(url, image_cache_dir=None, project_dir=None, image_dir=
 
 def get_image_size(filepath):
     return Image.open(filepath).size
+
+
+def get_annotated_dataset(project_id, hostname=None, api_key=None, timeout=None):
+    """Retrieve a project's exported data from the Label Studio API (demo helper).
+
+    Args:
+        project_id: ID of the Label Studio project to export.
+        hostname: Base URL of the Label Studio instance; when None it is
+            looked up with get_env('HOSTNAME').
+        api_key: Token sent in the Authorization header; when None it is
+            looked up with get_env('API_KEY').
+        timeout: Optional timeout forwarded to requests.get; the default
+            None applies no timeout, which is the previous behaviour.
+
+    Returns:
+        The JSON-decoded body of the export response.
+
+    Raises:
+        ValueError: if no hostname or no API key could be determined.
+        Exception: if the export request does not answer with HTTP 200.
+    """
+    if hostname is None:
+        hostname = get_env('HOSTNAME')
+    if api_key is None:
+        api_key = get_env('API_KEY')
+    # Fail early with a clear message rather than an AttributeError on
+    # hostname.rstrip() or a "Token None" header when a value is missing.
+    if not hostname:
+        raise ValueError("Label Studio hostname is not set: pass hostname or configure HOSTNAME")
+    if not api_key:
+        raise ValueError("Label Studio API key is not set: pass api_key or configure API_KEY")
+    download_url = f'{hostname.rstrip("/")}/api/projects/{project_id}/export'
+    response = requests.get(
+        download_url,
+        headers={'Authorization': f'Token {api_key}'},
+        timeout=timeout,
+    )
+    if response.status_code != 200:
+        raise Exception(f"Can't load task data using {download_url}, "
+                        f"response status_code = {response.status_code}")
+    return json.loads(response.content)