Merge pull request #150 from sbesson/memoregenerator_setId_order

chris-allan · web-flow · commit aa533d662602 · 2025-01-17T15:47:25.000Z
Improve fileset distribution during the memo file regeneration
diff --git a/src/dist/memo_regenerator.sql b/src/dist/memo_regenerator.sql
@@ -1,18 +1,24 @@
-COPY (SELECT * FROM (
-    SELECT image.id AS imageId,
-           pixels.id AS pixelsId,
-           image.series,
-           pixelstype.value AS pixelstype,
-           pixels.sizeX,
-           pixels.sizeY,
-           pixels.sizeZ,
-           pixels.sizeC,
-           pixels.sizeT,
-           format.value,
-           rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
-        FROM fileset
-            JOIN image ON fileset.id = image.fileset
-            JOIN pixels ON image.id = pixels.image
-            JOIN pixelstype ON pixels.pixelstype = pixelstype.id
-            JOIN format ON image.format = format.id
-) AS rank WHERE rank.rank = 1) TO STDOUT CSV;
+COPY (SELECT * FROM (
+    SELECT image.id AS imageId,
+           pixels.id AS pixelsId,
+           image.series,
+           pixelstype.value AS pixelstype,
+           pixels.sizeX,
+           pixels.sizeY,
+           pixels.sizeZ,
+           pixels.sizeC,
+           pixels.sizeT,
+           format.value,
+           e2.time - e1.time AS setId,
+           rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
+        FROM fileset
+            JOIN image ON fileset.id = image.fileset
+            JOIN pixels ON image.id = pixels.image
+            JOIN pixelstype ON pixels.pixelstype = pixelstype.id
+            JOIN format ON image.format = format.id
+            JOIN event e2 on image.creation_id=e2.id
+            JOIN filesetjoblink on  filesetjoblink.parent=fileset.id
+            JOIN job on filesetjoblink.child=job.id
+            JOIN uploadjob on job.id=uploadjob.job_id
+            JOIN event e1 on job.update_id=e1.id
+)  AS query WHERE query.rank = 1 ORDER BY query.setId desc) TO STDOUT CSV;
diff --git a/src/dist/regen-memo-files.sh b/src/dist/regen-memo-files.sh
@@ -22,37 +22,42 @@
 usage() {
     echo "Usage:"
     echo "$0 [OPTIONS]"
-    echo "Regenerates bioformats memofiles"
+    echo "Regenerates Bio-Formats memo files in parallel"
+    echo
+    echo "This utility queries the OMERO database for a list of filesets, splits the output"
+    echo "into several input files and runs the memoregenerator utility using GNU parallel."
     echo
     echo "  OPTIONS:"
-    echo "    --help                display usage and exit"
-    echo "    --db                  database connection string"
-    echo "    --jobs                max number of jobs to parallelize"
-    echo "    --memoizer-home       Location of image-region-ms"
+    echo "    --batch-size          Maximum number of entries in each input file sent to parallel (default: 500)"
+    echo "    --cache-options       Memofile cache options [/path/to/dir | inplace] (required)"
+    echo "    --csv                 Bypass sql query and use this csv for image list"
+    echo "    --db                  Database connection string"
     echo "    --force-image-regen   Force regeneration of image list even if it exists already"
+    echo "    --help                Display usage and exit"
+    echo "    --jobs                Maximum number of jobs to parallelize (default: number of processing units available)"
+    echo "    --memoizer-home       Location of image-region micro-service (default: current directory)"
     echo "    --no-ask              Do not ask for confirmation"
     echo "    --no-wait             Do not wait to start generating -- DO IT NOW"
-    echo "    --cache-options       Memofile cache options [/path/to/dir | inplace]"
-    echo "    --batch-size          # of image files to split list into"
-    echo "    --csv                 Bypass sql query and use this csv for image list"
     echo
-    echo "Example:"
-    echo "  $0 --db postgresql://user:pass@host:port/db --jobs [12|max] --memoizer-home /opt/omero/OMERO.ms-image-region.current --cache-options /path/to/dir"
+    echo "Examples:"
+    echo "  Regenerate memo files using the current cache directory and all available CPUs"
+    echo "  $0 --cache-options inplace"
+    echo "  Regenerate memo files offline using a secondary cache directory and 4 CPUs"
+    echo "  $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
+    echo "  Regenerate memo files offline using a secondary cache directory, all available CPUs and a database connection string"
+    echo "  $0 --db postgresql://user:pass@host:port/db --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
     exit $1
 }
 
 run_split_parallel_os_dep() {
 set -x
   export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
-  CENTOS_VERSION=$(cat /etc/centos-release |cut -f 3 -d' '|cut -d. -f 1)
   cd rslt.${DATESTR}
-  split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input.
-  PARALLEL_OPTS="error"
-  if [ "${CENTOS_VERSION}" = "6" ]; then
-    PARALLEL_OPTS="--halt 2 --gnu --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --result . ${DRYRUN}"
-  else
-    PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
-  fi
+  # Split the CSV file into N * JOBS files of at most BATCH_SIZE entries using round-robin distribution
+  N=$(wc -l ${FULL_CSV} | awk '{print $1}')
+  NFILES=$(( (($N - 1) / ($BATCH_SIZE * $JOBS) + 1 ) * $JOBS ))
+  split -a 3 -n r/$NFILES ${FULL_CSV} -d input.
+  PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
 set -x
   /usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \
     ${MEMOIZER_HOME}/bin/memoregenerator \