|
22 | 22 | usage() { |
23 | 23 | echo "Usage:" |
24 | 24 | echo "$0 [OPTIONS]" |
25 | | - echo "Regenerates bioformats memofiles" |
| 25 | + echo "Regenerates Bio-Formats memo files in parallel" |
| 26 | + echo |
| 27 | + echo "This utility queries the OMERO database for a list of filesets, splits the output" |
| 28 | + echo "into several input files and runs the memoregenerator utility using GNU parallel." |
26 | 29 | echo |
27 | 30 | echo " OPTIONS:" |
28 | | - echo " --help display usage and exit" |
29 | | - echo " --db database connection string" |
30 | | - echo " --jobs max number of jobs to parallelize" |
31 | | - echo " --memoizer-home Location of image-region-ms" |
| 31 | + echo " --batch-size Maximum number of entries in each input file sent to parallel (default: 500)" |
| 32 | + echo " --cache-options Memofile cache options [/path/to/dir | inplace] (required)" |
| 33 | + echo " --csv Bypass sql query and use this csv for image list" |
| 34 | + echo " --db Database connection string" |
32 | 35 | echo " --force-image-regen Force regeneration of image list even if it exists already" |
| 36 | + echo " --help Display usage and exit" |
| 37 | + echo " --jobs Maximum number of jobs to parallelize (default: number of processing units available)" |
| 38 | + echo " --memoizer-home Location of image-region micro-service (default: current directory)" |
33 | 39 | echo " --no-ask Do not ask for confirmation" |
34 | 40 | echo " --no-wait Do not wait to start generating -- DO IT NOW" |
35 | | - echo " --cache-options Memofile cache options [/path/to/dir | inplace]" |
36 | | - echo " --batch-size # of image files to split list into" |
37 | | - echo " --csv Bypass sql query and use this csv for image list" |
38 | 41 | echo |
39 | | - echo "Example:" |
40 | | - echo " $0 --db postgresql://user:pass@host:port/db --jobs [12|max] --memoizer-home /opt/omero/OMERO.ms-image-region.current --cache-options /path/to/dir" |
| 42 | + echo "Examples:" |
| 43 | + echo " Regenerate memo files using the current cache directory and all available CPUs" |
| 44 | + echo " $0 --cache-options inplace" |
| 45 | + echo " Regenerate memo files offline using a secondary cache directory and 4 CPUs" |
| 46 | + echo " $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )" |
| 47 | + echo " Regenerate memo files offline using a secondary cache directory, all available CPUs and a database connection string" |
| 48 | + echo " $0 --db postgresql://user:pass@host:port/db --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )" |
41 | 49 | exit $1 |
42 | 50 | } |
43 | 51 |
|
44 | 52 | run_split_parallel_os_dep() { |
45 | 53 | set -x |
46 | 54 | export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer" |
47 | | - CENTOS_VERSION=$(cat /etc/centos-release |cut -f 3 -d' '|cut -d. -f 1) |
48 | 55 | cd rslt.${DATESTR} |
49 | | - split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input. |
50 | | - PARALLEL_OPTS="error" |
51 | | - if [ "${CENTOS_VERSION}" = "6" ]; then |
52 | | - PARALLEL_OPTS="--halt 2 --gnu --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --result . ${DRYRUN}" |
53 | | - else |
54 | | - PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}" |
55 | | - fi |
| 56 | + # Split the CSV file into a multiple of JOBS files (the smallest multiple that keeps every file at or below BATCH_SIZE entries) using round-robin distribution |
| 57 | + N=$(wc -l ${FULL_CSV} | awk '{print $1}') |
| 58 | + NFILES=$(( (($N - 1) / ($BATCH_SIZE * $JOBS) + 1 ) * $JOBS )) |
| 59 | + split -a 3 -n r/$NFILES ${FULL_CSV} -d input. |
| 60 | + PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}" |
56 | 61 | set -x |
57 | 62 | /usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \ |
58 | 63 | ${MEMOIZER_HOME}/bin/memoregenerator \ |
|
0 commit comments