Skip to content

Commit aa533d6

Browse files
authored
Merge pull request #150 from sbesson/memoregenerator_setId_order
Improve fileset distribution during the memo file regeneration
2 parents 7b085fb + 68cfcfc commit aa533d6

File tree

2 files changed

+47
-36
lines changed

2 files changed

+47
-36
lines changed

src/dist/memo_regenerator.sql

Lines changed: 24 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,18 +1,24 @@
1-
COPY (SELECT * FROM (
2-
SELECT image.id AS imageId,
3-
pixels.id AS pixelsId,
4-
image.series,
5-
pixelstype.value AS pixelstype,
6-
pixels.sizeX,
7-
pixels.sizeY,
8-
pixels.sizeZ,
9-
pixels.sizeC,
10-
pixels.sizeT,
11-
format.value,
12-
rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
13-
FROM fileset
14-
JOIN image ON fileset.id = image.fileset
15-
JOIN pixels ON image.id = pixels.image
16-
JOIN pixelstype ON pixels.pixelstype = pixelstype.id
17-
JOIN format ON image.format = format.id
18-
) AS rank WHERE rank.rank = 1) TO STDOUT CSV;
1+
COPY (SELECT * FROM (
2+
SELECT image.id AS imageId,
3+
pixels.id AS pixelsId,
4+
image.series,
5+
pixelstype.value AS pixelstype,
6+
pixels.sizeX,
7+
pixels.sizeY,
8+
pixels.sizeZ,
9+
pixels.sizeC,
10+
pixels.sizeT,
11+
format.value,
12+
e2.time - e1.time AS setId,
13+
rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
14+
FROM fileset
15+
JOIN image ON fileset.id = image.fileset
16+
JOIN pixels ON image.id = pixels.image
17+
JOIN pixelstype ON pixels.pixelstype = pixelstype.id
18+
JOIN format ON image.format = format.id
19+
JOIN event e2 on image.creation_id=e2.id
20+
JOIN filesetjoblink on filesetjoblink.parent=fileset.id
21+
JOIN job on filesetjoblink.child=job.id
22+
JOIN uploadjob on job.id=uploadjob.job_id
23+
JOIN event e1 on job.update_id=e1.id
24+
) AS query WHERE query.rank = 1 ORDER BY query.setId desc) TO STDOUT CSV;

src/dist/regen-memo-files.sh

Lines changed: 23 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -22,37 +22,42 @@
22 22
usage() {
23 23
echo "Usage:"
24 24
echo "$0 [OPTIONS]"
25-
echo "Regenerates bioformats memofiles"
25+
echo "Regenerates Bio-Formats memo files in parallel"
26+
echo
27+
echo "This utility queries the OMERO database for a list of filesets, splits the output"
28+
echo "into several input files and runs the memoregenerator utility using GNU parallel."
26 29
echo
27 30
echo " OPTIONS:"
28-
echo " --help display usage and exit"
29-
echo " --db database connection string"
30-
echo " --jobs max number of jobs to parallelize"
31-
echo " --memoizer-home Location of image-region-ms"
31+
echo " --batch-size Maximum number of entries in each input file sent to parallel (default: 500)"
32+
echo " --cache-options Memofile cache options [/path/to/dir | inplace] (required)"
33+
echo " --csv Bypass sql query and use this csv for image list"
34+
echo " --db Database connection string"
32 35
echo " --force-image-regen Force regeneration of image list even if it exists already"
36+
echo " --help Display usage and exit"
37+
echo " --jobs Maximum number of jobs to parallelize (default: number of processing units available)"
38+
echo " --memoizer-home Location of image-region micro-service (default: current directory)"
33 39
echo " --no-ask Do not ask for confirmation"
34 40
echo " --no-wait Do not wait to start generating -- DO IT NOW"
35-
echo " --cache-options Memofile cache options [/path/to/dir | inplace]"
36-
echo " --batch-size # of image files to split list into"
37-
echo " --csv Bypass sql query and use this csv for image list"
38 41
echo
39-
echo "Example:"
40-
echo " $0 --db postgresql://user:pass@host:port/db --jobs [12|max] --memoizer-home /opt/omero/OMERO.ms-image-region.current --cache-options /path/to/dir"
42+
echo "Examples:"
43+
echo " Regenerate memo files using the current cache directory and all available CPUs"
44+
echo " $0 --cache-options inplace"
45+
echo " Regenerate memo files offline using a secondary cache directory and 4 CPUs"
46+
echo " $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
47+
echo " Regenerate memo files offline using a secondary cache directory, all available CPUs and a database connection string"
48+
echo " $0 --db postgresql://user:pass@host:port/db --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
41 49
exit $1
42 50
}
43 51

44 52
run_split_parallel_os_dep() {
45 53
set -x
46 54
export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
47-
CENTOS_VERSION=$(cat /etc/centos-release |cut -f 3 -d' '|cut -d. -f 1)
48 55
cd rslt.${DATESTR}
49-
split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input.
50-
PARALLEL_OPTS="error"
51-
if [ "${CENTOS_VERSION}" = "6" ]; then
52-
PARALLEL_OPTS="--halt 2 --gnu --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --result . ${DRYRUN}"
53-
else
54-
PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
55-
fi
56+
# Split the CSV file into N * JOBS files of at most BATCH_SIZE entries using round-robin distribution
57+
N=$(wc -l ${FULL_CSV} | awk '{print $1}')
58+
NFILES=$(( (($N - 1) / ($BATCH_SIZE * $JOBS) + 1 ) * $JOBS ))
59+
split -a 3 -n r/$NFILES ${FULL_CSV} -d input.
60+
PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
56 61
set -x
57 62
/usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \
58 63
${MEMOIZER_HOME}/bin/memoregenerator \

0 commit comments

Comments
 (0)