Merge pull request #92 from LuCEresearchlab/minor-patch

malags · web-flow · commit 0fbb41949e7d · 2021-04-25T20:23:42.000Z
Added safety for dataset uploading
diff --git a/README.md b/README.md
@@ -35,8 +35,15 @@ In case there are issues due to dependencies try to rebuild the containers (will
 docker-compose up --build  # rebuild
 ```
 
-Note: The clustering might take a while, the clusters tend to finish close to each other don't panic if you see no
-progress for several minutes.
+Notes:
+
+- The clustering might take a while. The clusters tend to finish close to each other, so don't panic if you see no
+  progress for several minutes.
+- If the tagging-service is interrupted before completing the clustering, it'll be necessary to manually log into the
+  database and delete the object under `dataset_db/dataset` whose `dataset_id` matches the interrupted dataset.
+  
+  If the entry is not deleted, it'll be impossible to complete the clustering for it and the dataset will always be
+  shown as loading.
 
 Optimizations: It's possible to change the resources allocated to the python-service from `.env `, this will impact
 clustering performance, to change the resources modify the environment variables:
diff --git a/frontend/src/pages/tagging/DatasetSelection.tsx b/frontend/src/pages/tagging/DatasetSelection.tsx
@@ -59,7 +59,7 @@ function DatasetSelection() {
                 <TableBody>
                     {datasets.map((dataset: DatasetDesc) => {
                         const loading_cluster = dataset.clusters_computed != dataset.nr_questions
-                        const needed_time_s = 1000 * 60 * 2 * dataset.nr_questions
+                        const needed_time_s = 1000 * 90 * dataset.nr_questions
                         const started = new Date(dataset.creation_data)
                         const now = new Date()
 
diff --git a/tagging-service/flaskr/endpoints/upload_api.py b/tagging-service/flaskr/endpoints/upload_api.py
@@ -66,6 +66,10 @@ def thread_function(dataset):
 
         json_dataset = json.loads(uploaded_file.read())
 
+        dataset_from_db = get_dataset(dataset_id=json_dataset['dataset_id'])
+        if dataset_from_db is not None and dataset_from_db['clusters_computed'] != len(dataset_from_db['questions']):
+            return f'rejected file: {uploaded_file.name}, dataset still uploading'
+
         Thread(target=thread_function, args=(json_dataset,)).start()
 
         return f'uploaded file: {uploaded_file.name} successfully'