diff --git a/README.md b/README.md
index 2f705f5..a28edcc 100644
--- a/README.md
+++ b/README.md
@@ -35,8 +35,15 @@ In case there are issues due to dependencies try to rebuild the containers (will
 docker-compose up --build # rebuild
 ```
 
-Note: The clustering might take a while, the clusters tend to finish close to each other don't panic if you see no
-progress for several minutes.
+Notes:
+
+- The clustering might take a while. The clusters tend to finish close to each other, so don't panic if you see no
+  progress for several minutes.
+- If the tagging-service is interrupted before completing the clustering, it'll be necessary to manually log into the
+  database and delete the object under `dataset_db/dataset` with `dataset_id` equal to the one that was interrupted.
+
+  If the entry is not deleted, it'll be impossible to complete the clustering for it and the dataset will always show
+  up as loading.
 
 Optimizations: It's possible to change the resources allocated to the python-service from `.env `, this will impact
 clustering performance, to change the resources modify the environment variables:
diff --git a/frontend/src/pages/tagging/DatasetSelection.tsx b/frontend/src/pages/tagging/DatasetSelection.tsx
index 8fb746c..c954721 100644
--- a/frontend/src/pages/tagging/DatasetSelection.tsx
+++ b/frontend/src/pages/tagging/DatasetSelection.tsx
@@ -59,7 +59,7 @@ function DatasetSelection() {
         {datasets.map((dataset: DatasetDesc) => {
           const loading_cluster = dataset.clusters_computed != dataset.nr_questions
 
-          const needed_time_s = 1000 * 60 * 2 * dataset.nr_questions
+          const needed_time_s = 1000 * 90 * dataset.nr_questions
 
           const started = new Date(dataset.creation_data)
           const now = new Date()
diff --git a/tagging-service/flaskr/endpoints/upload_api.py b/tagging-service/flaskr/endpoints/upload_api.py
index 166bbed..8f9fc62 100644
--- a/tagging-service/flaskr/endpoints/upload_api.py
+++ b/tagging-service/flaskr/endpoints/upload_api.py
@@ -66,6 +66,10 @@ def thread_function(dataset):
 
     json_dataset = json.loads(uploaded_file.read())
 
+    dataset_from_db = get_dataset(dataset_id=json_dataset['dataset_id'])
+    if dataset_from_db is not None and dataset_from_db['clusters_computed'] != len(dataset_from_db['questions']):
+        return f'rejected file: {uploaded_file.name}, dataset still uploading'
+
     Thread(target=thread_function, args=(json_dataset,)).start()
     return f'uploaded file: {uploaded_file.name} successfully'
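
For the manual cleanup described in the new README note, here is a minimal sketch of the deletion. The backing store is not named in the diff, so this assumes a MongoDB backend (the note's `dataset_db/dataset` path suggests a `dataset_db` database with a `dataset` collection); the connection URI and the dataset id are placeholders for the actual deployment:

```python
# Minimal cleanup sketch, not part of the project's own tooling.
# Assumptions: MongoDB backend, database `dataset_db`, collection `dataset`;
# replace the URI and <interrupted-dataset-id> with your deployment's values.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")

# Remove the entry left behind by the interrupted clustering run.
result = client["dataset_db"]["dataset"].delete_one(
    {"dataset_id": "<interrupted-dataset-id>"}
)
print(f"removed {result.deleted_count} stale dataset entry")
```

Once the stale entry is gone, re-uploading the dataset should restart the clustering from scratch instead of leaving it stuck in the loading state.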
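The new guard in `upload_api.py` reuses the same completeness test as the frontend's `loading_cluster` flag: a dataset is still clustering while `clusters_computed` has not caught up with the number of questions. A hypothetical standalone helper expressing that invariant (the name is illustrative, not part of the codebase):

```python
from typing import Optional


def is_still_clustering(dataset_from_db: Optional[dict]) -> bool:
    """Hypothetical helper mirroring the upload guard's check: a dataset is
    still clustering while a database entry exists but the number of computed
    clusters differs from the number of questions."""
    return (
        dataset_from_db is not None
        and dataset_from_db["clusters_computed"] != len(dataset_from_db["questions"])
    )
```

Rejecting the upload up front avoids spawning a second clustering thread for a dataset whose previous run never finished, which is exactly the state that otherwise requires the manual cleanup above. (Incidentally, despite its `_s` suffix, the frontend's `needed_time_s` holds milliseconds: `1000 * 90` ms per question, i.e. 90 s, down from the previous 2-minute estimate.)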