-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathwochenende_postprocess.sh
388 lines (346 loc) · 13.3 KB
/
wochenende_postprocess.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
#!/bin/bash
# Automated postprocessing of results from the Wochenende pipeline, with wochenende reporting and haybaler.
# Authors: Colin Davenport, Sophia Poertner
version="0.36, Jan 2022"
#Changelog
#0.36 - add random number to allow concurrent independent fileprep jobs
#0.35 - update raspir and attempt raspir samtools depth parallel speedup
#0.34 - resolve bug with ordering of conda envs with -a option (thanks @irosenboom, @vangreuj )
#0.33 - add -a all option, test refactoring with global scheduler setup
#0.32 - add command line args
#0.31 - remove --no-plots
#0.30 - remove growth rate temp files
#0.29 - solve growth rate problems
#0.28 - check important env vars are set.
#0.27 - add growth rate estimation by Tom Wehrbein @Leibniz University Hannover
#0.26 - don't save voluminous sambamba depth output to log
#0.25 - add viral read extraction
#0.24 - use bash config.yaml parsing
#0.23 - handle mq20 output files
#0.22 - add heat trees
#0.21 - attempt recovery for second runs to copy data from csv or txt subdirs into haybaler dir
#0.20 - add haybaler heat tree support
#0.19 - update haybaler copying and add double square brackets for bash ifs
#0.18 - make wochenende_plot optional with --no-plots
#0.17 - check directories and improve haybaler integration
#0.16 - use Haybaler update runbatch_heatmaps.sh
#0.15 - check for files to cleanup before moving
#0.14 - add haybaler env and use this
#0.13 - copy filled directory plots and reporting into current directory, if missing
#0.12 - make prerequisite docs clearer
#0.11 - add variable for random
#0.10 - initial commits
echo "INFO: Postprocess Wochenende BAM and bam.txt files for plotting, reporting and haybaler integration"
echo "INFO: Version: " $version
echo "INFO: Usage: bash wochenende_postprocess.sh args"
echo "INFO: Usage: bash wochenende_postprocess.sh -r -h -s -p -g"
echo "INFO: Remember to run this using the haybaler conda environment if available - we attempt to load this in the script"
echo "INFO: ####### "
echo "INFO: Usage: Make sure the directories plots/ and reporting/ exist and are filled"
echo "INFO: eg. run: bash get_wochenende.sh to get the relevant files"
echo "INFO: ####### "
# Setup conda, directories and SLURM using data parsed from config.yaml
source $WOCHENENDE_DIR/scripts/parse_yaml.sh
eval $(parse_yaml $WOCHENENDE_DIR/config.yaml)
haybaler_dir=$HAYBALER_DIR
wochenende_dir=$WOCHENENDE_DIR
# Set and activate existing conda env
. $CONDA_SH_PATH
conda activate $WOCHENENDE_CONDA_ENV_NAME
# Setup job scheduler
# use SLURM job scheduler (yes, no)
if [[ "${USE_SLURM}" == "yes" && "${USE_CUSTOM_SCHED}" == "yes" ]]; then
echo "Config warning, both USE_SLURM and USE_CUSTOM_SCHED set. Defaulting to SLURM"
fi
# Setup job scheduler
# use SLURM job scheduler (yes, no)
if [[ "${USE_CUSTOM_SCHED}" == "yes" ]]; then
#echo USE_CUSTOM_SCHED set"
scheduler=$CUSTOM_SCHED_CUSTOM_PARAMS
fi
if [[ "${USE_SLURM}" == "yes" ]]; then
#echo USE_SLURM set"
scheduler=$SLURM_CUSTOM_PARAMS
fi
# check if env variables could be defined.
if [[ -z "${WOCHENENDE_DIR}" || -z "${HAYBALER_DIR}" ]]; then
echo "ERROR: WOCHENENDE_DIR or HAYBALER_DIR was not found. Use setup.sh in the Wochenende project to set the directory properly. Exiting! "
exit 1
fi
# Setup variable sleep duration. Might be useful to set higher for some big projects where the filesystem IO becomes too much.
# The wait command may fail for some SLURM jobs in these cases simply because files have not yet been written in time.
sleeptimer=12
#sleeptimer=120
# get current dir containing Wochenende BAM and bam.txt output
bamDir=$(pwd)
# Cleanup previous results to a directory with a random name which includes a number, calculated here.
rand_number=$RANDOM
# Save output log in directory containing bams and preprocess script
output_log=$bamDir"/postprocess_"$(date +%s)".log"
echo "INFO: output_log: " $output_log
### Check if required directories/files exist, copy if missing ###
if [[ ! -d "reporting" ]]
then
echo "INFO: Copying directory reporting, as it was missing! Use get_wochenende.sh to set up Wochenende properly."
cp -R $wochenende_dir/reporting .
fi
if [[ ! -d "plots" ]]
then
echo "INFO: Copying directory plots, as it was missing!"
cp -R $wochenende_dir/plots .
fi
if [[ ! -d "extract" ]]
then
echo "INFO: Copying directory extract, as it was missing!"
cp -R $wochenende_dir/extract .
fi
if [[ ! -d "growth_rate" ]]
then
echo "INFO: Copying directory growth_rate, as it was missing!"
cp -R $wochenende_dir/growth_rate .
fi
if [[ ! -f "reporting/ref.tmp" ]]
then
echo "INFO: Missing file reporting/ref.tmp , attempting to copy ./ref.tmp to reporting/ref.tmp"
cp ref.tmp reporting/ref.tmp
fi
## Set command line arguments
while [[ "$#" -gt 0 ]]; do
case $1 in
-a) runAll="1"; ;;
-r) runReporting="1"; ;;
-s) runRaspir="1"; ;;
-p) runPlotting="1"; ;;
-g) runGrowth="1"; ;;
-h) runHaybaler="1"; ;;
*) echo "Unknown parameter passed: $1"; exit 1 ;;
esac
shift
done
if [[ $runAll == "1" ]]
then
runReporting="1"
runRaspir="1"
runPlotting="1"
runGrowth="1"
runHaybaler="1"
fi
echo "###################################################"
echo "INFO: Selected stages via command line args (1 means activated)"
echo "INFO: Arguments Run all stages (-a) : $runAll"
echo "INFO: Arguments Run reporting (-r) : $runReporting"
echo "INFO: Arguments Run haybaler (-h) : $runHaybaler"
echo "INFO: Arguments Run Raspir (-s) : $runRaspir"
echo "INFO: Arguments Run plotting (-p) : $runPlotting"
echo "INFO: Arguments Run growth rate (-g) : $runGrowth"
echo "INFO: Starting Wochenende_postprocess"
echo "INFO: Current directory" $bamDir >>$output_log 2>&1
echo "INFO: Current directory" $bamDir
echo "INFO: Current directory" $bamDir >>$output_log 2>&1
sleep 3
# Run simple filtering and sorting script and multiqc
bash runbatch_metagen_awk_filter.sh
wait
# Check args
if [[ $runHaybaler == "1" ]]; then
#echo "INFO: Haybaler requires Wochenende reporting"
runReporting="1"
fi
# Run reporting
if [[ $runReporting == "1" ]]; then
echo "INFO: Started Wochenende reporting"
echo "INFO: Started Wochenende reporting" >>$output_log 2>&1
cd $bamDir
cd reporting
cp ../*.bam.txt .
bash run_Wochenende_reporting_SLURM.sh >>$output_log 2>&1
wait
echo "INFO: Sleeping for "$sleeptimer "to allow writes to complete."
sleep $sleeptimer
echo "INFO: Completed Wochenende reporting"
fi
# Run haybaler
if [[ $runHaybaler == "1" ]]; then
echo "INFO: Start Haybaler"
echo "INFO: Start Haybaler" >>$output_log 2>&1
if [[ ! -d "haybaler" ]]
then
mkdir haybaler
fi
# count files and only copy files that exist to avoid missing files errors
count_mq20=`ls -1 *mq20.bam*us*.csv 2>/dev/null | wc -l`
count_mq30=`ls -1 *mq30.bam*us*.csv 2>/dev/null | wc -l`
count_dup=`ls -1 *dup.bam*us*.csv 2>/dev/null | wc -l`
count=`ls -1 *.bam*us*.csv 2>/dev/null | wc -l`
if [[ $count_mq30 != 0 ]]
then
cp *mq30.bam*us*.csv haybaler
elif [[ $count_mq20 != 0 ]]
then
cp *mq20.bam*us*.csv haybaler
elif [[ $count_dup != 0 ]]
then
cp *dup.bam*us*.csv haybaler
elif [[ $count != 0 ]]
then
cp *.bam*us*.csv haybaler
else
echo "WARNING: No bam*us*.csv found to process for haybaler"
echo "INFO: Attempting to find and copy bam*us*.csv to haybaler"
cp txt/*.bam*us*.csv haybaler
cp csv/*.bam*us*.csv haybaler
fi
cd haybaler
conda activate $HAYBALER_CONDA_ENV_NAME
cp $haybaler_dir/*.sh .
cp $haybaler_dir/*.py .
cp $haybaler_dir/*.R .
bash run_haybaler.sh $haybaler_dir >>$output_log 2>&1
wait
cp $haybaler_dir/runbatch_heatmaps.sh haybaler_output/ && cp $haybaler_dir/*.R haybaler_output/
cp $haybaler_dir/*tax* haybaler_output/
cp $haybaler_dir/*tree* haybaler_output/
echo "INFO: Attempting to filter results and create heatmaps. Requires R installation."
cd haybaler_output
bash runbatch_heatmaps.sh >>$output_log 2>&1
echo "INFO: Attempting to add taxonomy. Requires pytaxonkit."
bash run_haybaler_tax.sh >>$output_log 2>&1
echo "INFO: Attempting to create heat-trees. Requires R installation and packages: packages = c("metacoder", "taxa", "dplyr", "tibble", "ggplot2")."
bash run_heattrees.sh >>$output_log 2>&1
cd ..
cd ..
echo "INFO: Sleeping for "$sleeptimer "to allow writes to complete."
sleep $sleeptimer
echo "INFO: Start csv to xlsx conversion"
bash runbatch_csv_to_xlsx.sh >>$output_log 2>&1
wait
echo "INFO: Sleeping for "$sleeptimer "to allow writes to complete."
sleep $sleeptimer
echo "INFO: Completed Haybaler"
fi
# leave haybaler conda env, reactivate Wochenende conda env
conda deactivate
conda activate $WOCHENENDE_CONDA_ENV_NAME
if [[ $runReporting == "1" ]]; then
echo "INFO: Start cleanup reporting"
cd $bamDir
cd reporting
# create backup, move folders from previous reporting run to a directory (if the txt directory exists already)
mkdir reporting_$rand_number
if [[ -d "txt" ]]
then
mv txt/ csv/ xlsx/ reporting_$rand_number
fi
# make and fill current folders from this run
mkdir txt csv xlsx
# cleanup .txt, .csv and .xlsx files if they exist in directory
count=`ls -1 *.txt 2>/dev/null | wc -l`
if [[ $count != 0 ]]
then
mv *.txt txt
fi
count=`ls -1 *.csv 2>/dev/null | wc -l`
if [[ $count != 0 ]]
then
mv *.csv csv
fi
count=`ls -1 *.xlsx 2>/dev/null | wc -l`
if [[ $count != 0 ]]
then
mv *.xlsx xlsx
fi
cd $bamDir
echo "INFO: Completed cleanup reporting"
fi
if [[ $runPlotting == "1" ]]
then
echo "INFO: Started Sambamba depth"
bash runbatch_sambamba_depth.sh >>/dev/null 2>&1
wait
echo "INFO: Sleeping for "$sleeptimer "to allow writes to complete."
sleep $sleeptimer
bash runbatch_metagen_window_filter.sh >>$output_log 2>&1
wait
echo "INFO: Completed Sambamba depth and filtering"
fi
# Growth rate
if [[ $runGrowth == "1" ]]; then
echo "INFO: Started bacterial growth rate analysis"
echo "INFO: Started bacterial growth rate analysis" >>$output_log 2>&1
cd $bamDir
cd growth_rate/
echo "INFO: Cleanup original results" >>$output_log 2>&1
rm -rf fit_results >>/dev/null 2>&1
rm -rf *_subsamples >>/dev/null 2>&1
bash runbatch_bed_to_csv.sh >>$output_log 2>&1
wait
echo "INFO: Sleeping for "$sleeptimer "to allow writes to complete."
sleep $sleeptimer
bash run_reproduction_determiner.sh >>$output_log 2>&1
wait
echo "INFO: Sleeping for "$sleeptimer "to allow writes to complete."
sleep $sleeptimer
echo "INFO: Files produced by growth rate" >>$output_log 2>&1
ls "fit_results/output/*" >>$output_log 2>&1
cd $bamDir
echo "INFO: Completed bacterial growth rate analysis, see growth_rate/fit_results/output for results"
fi
# Plots
if [[ $runPlotting == "1" ]]; then
echo "INFO: Started Wochenende plot"
echo "INFO: Started Wochenende plot" >>$output_log 2>&1
cd $bamDir
cd plots
cp ../*_window.txt .
cp ../*_window.txt.filt.csv .
bash runbatch_wochenende_plot.sh >>$output_log 2>&1
#wait
echo "INFO: Sleeping for "$sleeptimer "to allow writes to complete."
sleep $sleeptimer
cd $bamDir
echo "INFO: Completed Wochenende plot"
echo "INFO: - Extracting selected human viral pathogen reads"
echo "INFO: - Extracting selected human viral pathogen reads" >>$output_log 2>&1
cd $bamDir
bash extract_viral_reads.sh >>$output_log 2>&1
wait
echo "INFO: Sleeping for "$sleeptimer "to allow writes to complete."
sleep $sleeptimer
fi
# raspir
if [[ $runRaspir == "1" ]]; then
echo "INFO: Run raspir by M. Pust"
echo "INFO: Run raspir by M. Pust" >>$output_log 2>&1
conda activate $HAYBALER_CONDA_ENV_NAME
cd $bamDir/raspir
#cleanup
rm -f *raspir*.csv >/dev/null 2>/dev/null
echo "INFO: link BAM files in" >>$output_log 2>&1
bash batch_create_links.sh >>$output_log 2>&1
echo "INFO: Start preparing the files for raspir. Now with SLURM loop" >>$output_log 2>&1
# Use random number to create unique fileprep job names and avoid clashes when multiple fileprep jobs are running
rand=$RANDOM
for input_bam in `ls *.bam`
do
if [[ "${USE_SLURM}" == "yes" ]];
then
scheduler=$SLURM_CUSTOM_PARAMS
# SLURM job scheduler- srun will not work here, need sbatch
sbatch -J fileprep$rand run_SLURM_file_prep.sh $input_bam >>$output_log 2>&1
else
# local job submission
bash run_SLURM_file_prep.sh $input_bam >>$output_log 2>&1
fi
done
echo "INFO: waiting for raspir file prep jobs to complete"
srun --dependency=singleton --job-name=fileprep$rand sleep $sleeptimer
wait
echo "INFO: Run raspir" >>$output_log 2>&1
sbatch run_raspir_SLURM.sh >>$output_log 2>&1
echo "INFO: Remove soft linked BAM files" >>$output_log 2>&1
bash batch_remove_links.sh >>$output_log 2>&1
cd $bamDir
echo "INFO: Raspir module completed"
fi
echo "INFO: Remember to check the output log for errors at: " $output_log
echo "INFO: ########### Completed Wochenende_postprocess #############"