#!/bin/bash
###################################################################
# Bioinformatic data processing for metabarcoding
# for 16S V4 data, with amplicons generated by following Kozich et al. 2013 AEM
# by Alexander Keller (LMU München)
# email: [email protected]
# if you use this script, please kindly cite this article:
# https://doi.org/10.1098/rstb.2021.0171
###################################################################
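# usage: bash _processing_MB_0.2a.sh <project_directory>
# run it from the parent of <project_directory>; that directory is expected to
# contain the raw *_R1_*/*_R2_* FASTQ files and a config.txt, with the
# pipeline's _resources/ folder alongside it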
githash=$(git log -1 --pretty=format:%h)
gitversion=$(git log --pretty=format:%h | wc -l)
if [ -z "$1" ]; then
echo 'No directory supplied' >&2
exit 1
fi
project=$(echo "$1" | sed 's:/*$::' | sed 's/^.*\///')
path=$1
now=$(date)
nowformat=$(date +"%y-%m-%d")
mkdir -p $project.$nowformat
# start logging
(
echo "------------------------------------------------------------------"
echo "script: https://github.com/chiras/metabarcoding_pipeline"
echo "version: $githash, revision $gitversion"
echo "------------------------------------------------------------------"
echo "project: $project"
echo "project path: $path"
echo "results output path: $project.$nowformat"
echo "script log file path: $project.$nowformat/script_$nowformat.log"
echo "------------------------------------------------------------------"
echo "-- moving in data dir"
cd $project
echo "-- loading config"
source config.txt
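# config.txt defines the variables used throughout this script. A minimal
# sketch with illustrative placeholder values (the variable names are the
# ones this script reads; the values are assumptions, not recommendations):
#   vsearch=vsearch; mafft=mafft; raxml=raxml-ng
#   threads=4; marker="16S"; datasupplier=""
#   classificationOnly=0; skip_preprocessing=0; use_fw_only=0; phylogeny=0
#   cutend_fw=250; cutend_rv=200; stripleft=0; stripright=0
#   mergeoptions=""; merge_minovlen=20; merge_maxdiffs=10
#   fastq_maxee=1; fastq_minlen=200; fastq_maxlen=500; fastq_maxns=0; fastq_truncee=1
#   unoisealpha=2; postcluster=0; tax_threshold=97; sintax_cutoff=0.9
#   refDBs=("refdb1.fasta" "refdb2.fasta"); hieDBs="hierarchical.fasta"
#   compressionCleanup=0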
echo "-- creating directories"
mkdir -p logs
mkdir -p raw
mkdir -p tmp
# default unset flags to 0 so the numeric tests never see an empty string
if [ "${classificationOnly:-0}" -ne 1 ]
then
if [ "${skip_preprocessing:-0}" -ne 1 ]
then
echo "-- decompressing raw files"
#extracting files
find . -name '*.gz' -print0 | xargs -0 -I {} -P $threads gunzip {}
# Define the header of the log file
echo "Sample ID,Total Reads,Merged Reads,Filtered Reads,Truncated Reads, Selected Strategy,Selected File,Selected Reads" > logs/_consolidated_log.csv
# Loop through all files for merging
echo "-- Starting merging and filtering --"
for f in *_R1_*.fastq; do
# Initialize variables for this sample
r=$(sed -e "s/_R1_/_R2_/" <<< "$f")
s=$(echo $f | sed "s/_L001.*//g")
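# e.g. with typical Illumina naming: f=Sample01_S1_L001_R1_001.fastq
# -> r=Sample01_S1_L001_R2_001.fastq, s=Sample01_S1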
total=$(( $(wc -l < $f) / 4 )) # 4 lines per FASTQ record; grep "^@" would also match quality lines
echo "===================================="
echo "Processing sample $s"
echo "(F: $f R: $r)"
echo "===================================="
# Stripping last bases for quality
$vsearch --fastq_filter $f \
--fastq_trunclen_keep $cutend_fw \
--fastq_truncee 3 \
--fastqout $f.sl.fq \
--threads $threads 2> logs/vsearch.rtf.fw.$s.log
$vsearch --fastq_filter $r \
--fastq_trunclen_keep $cutend_rv \
--fastq_truncee 3 \
--fastqout $r.sl.fq \
--threads $threads 2> logs/vsearch.rtf.rv.$s.log
# Merging reads
$vsearch --fastq_mergepairs $f.sl.fq \
--reverse $r.sl.fq \
$mergeoptions \
--fastq_minovlen $merge_minovlen \
--fastq_maxdiffs $merge_maxdiffs \
--fastqout $s.merge.fq \
--fastq_eeout \
--relabel R1+2-${s}_ \
--threads $threads 2> logs/vsearch.m.$s.log
merged_reads=$(grep "Merged" logs/vsearch.m.$s.log )
echo "$s : $merged_reads" | tee -a logs/_merging.log
merged_reads=$(echo $merged_reads | grep -o '[0-9]*' | head -n 1)
merged_reads=${merged_reads:-0} # default to 0 so the comparisons below never see an empty string
# Quality filtering on merged reads
$vsearch --fastq_filter $s.merge.fq \
--fastq_stripleft $stripleft \
--fastq_stripright $stripright \
--fastq_maxee $fastq_maxee \
--fastq_minlen $fastq_minlen \
--fastq_maxlen $fastq_maxlen \
--fastq_maxns $fastq_maxns \
--fastaout $s.mergefiltered.fa \
--fasta_width 0 \
--threads $threads 2> logs/vsearch.mf.$s.log
filtered_reads=$(grep "sequences kept" logs/vsearch.mf.$s.log )
echo "$s : $filtered_reads" | tee -a logs/_filter.log
filtered_reads=$(echo $filtered_reads | grep -o '[0-9]*' | head -n 1)
filtered_reads=${filtered_reads:-0}
# Fallback: truncate and quality-filter the forward reads only (used when merging performs poorly)
$vsearch --fastq_truncee $fastq_truncee \
--fastq_filter $f \
--fastq_minlen $fastq_minlen \
--fastaout $s.trunc.fa \
--relabel R1-${s}_ \
--threads $threads 2> logs/vsearch.tf.$s.log
trunc_reads=$(grep "sequences kept" logs/vsearch.tf.$s.log )
echo "$s : $trunc_reads" | tee -a logs/_truncfilter.log
trunc_reads=$(echo $trunc_reads | grep -o '[0-9]*' | head -n 1)
trunc_reads=${trunc_reads:-0}
# Apply conditions for selecting the best file
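# Heuristic, as encoded below:
#   filtered > 10k                     -> merged+filtered
#   filtered > 5k and truncated > 10k  -> truncated forward reads
#   filtered > 5k and truncated < 10k  -> merged+filtered
#   filtered < 5k and truncated > 5k   -> truncated forward reads
#   otherwise merged+filtered, unless truncated >= 1.5x filtered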
selected_file=""
if [[ $filtered_reads -gt 10000 ]]; then
selected_file="$s.mergefiltered.fa"
selected_reads=$filtered_reads
not_selected_reads=$trunc_reads
strategy="MERGE & Filter"
elif [[ $filtered_reads -gt 5000 && $trunc_reads -gt 10000 ]]; then
selected_file="$s.trunc.fa"
selected_reads=$trunc_reads
not_selected_reads=$filtered_reads
strategy="TRUNC & Filter"
elif [[ $filtered_reads -gt 5000 && $trunc_reads -lt 10000 ]]; then
selected_file="$s.mergefiltered.fa"
selected_reads=$filtered_reads
not_selected_reads=$trunc_reads
strategy="MERGE & Filter"
elif [[ $filtered_reads -lt 5000 && $trunc_reads -gt 5000 ]]; then
selected_file="$s.trunc.fa"
selected_reads=$trunc_reads
not_selected_reads=$filtered_reads
strategy="TRUNC & Filter"
else
if [[ $trunc_reads -lt $(( 3 * filtered_reads / 2 )) ]]; then
selected_file="$s.mergefiltered.fa"
selected_reads=$filtered_reads
not_selected_reads=$trunc_reads
strategy="MERGE & Filter (Low)"
else
selected_file="$s.trunc.fa"
selected_reads=$trunc_reads
not_selected_reads=$filtered_reads
strategy="TRUNC & Filter (Low)"
fi
fi
# Output the selected file and copy it as final selection
cp "$selected_file" "$s.selection.fa"
# Log results to consolidated CSV file
echo "$s,$total,$merged_reads,$filtered_reads,$trunc_reads,$strategy,$selected_file,$selected_reads" >> logs/_consolidated_log.csv
# Display selection summary
echo "Selected $strategy: $selected_file ($selected_reads reads)"
echo ""
done
cat *selection.fa > all.merge.fasta
if [ "${use_fw_only:-0}" -eq 1 ]
then
cp all.merge.fasta all.merge.fasta.bak
# rebuild the forward-read-only dataset here; the per-sample *.trunc.fa files
# still exist at this point (cleanup happens below)
cat *trunc.fa > all.trunc.fasta
cp all.trunc.fasta all.merge.fasta
echo " "
echo "!!! INFO: using forward reads only"
echo " "
fi #end useFWonly
# cleanup
echo "-- cleanup"
mv *.fastq raw/
mv *.fq tmp/
mv *.fa tmp/
echo "-- removing primer sequences"
python ../_resources/python/remove_primers_2.py --input all.merge.fasta --output all.merge.fasta.noprimer.fa --marker $marker
fi #end skippp
echo " "
echo "===================================="
echo "ASV generation and mapping"
echo "===================================="
echo "-- derep"
$vsearch --derep_fulllength all.merge.fasta.noprimer.fa \
--minuniquesize 2 \
--sizein \
--sizeout \
--fasta_width 0 \
--uc all.merge.derep.uc \
--output all.merge.derep.fa 2> logs/_derep.log
echo "-- denoise"
$vsearch --cluster_unoise all.merge.derep.fa \
--sizein --sizeout \
--relabel ASV \
--unoise_alpha $unoisealpha \
--centroids zotus.merge_chim.fa \
--threads $threads 2> logs/_unoise.log
grep "Clusters" logs/_unoise.log
grep "Singletons" logs/_unoise.log
echo "-- sorting"
$vsearch --sortbysize zotus.merge_chim.fa \
--output zotus.merge_sorted.fa \
--threads $threads 2> logs/_sort.log
echo "-- chimera removal"
$vsearch --uchime3_denovo zotus.merge_sorted.fa \
--abskew 16 \
--nonchimeras zotus.merge.fa \
--threads $threads 2> logs/_uchime.log
grep "Found" logs/_uchime.log
if [[ $postcluster -gt 0 ]]; then
mv zotus.merge.fa zotus.merge.tmp.fa
echo "-- postclustering of ASVs at $postcluster% identity"
$vsearch --cluster_fast zotus.merge.tmp.fa \
--centroids zotus.merge.fa \
--sizein --sizeout --sizeorder \
--threads $threads \
--id 0.$postcluster 2> logs/_postcluster.log
clusterin=$(grep -c ">" zotus.merge.tmp.fa)
clusterout=$(grep -c ">" zotus.merge.fa)
echo "$clusterin ASVs post-clustered to $clusterout pASVs"
fi
### create community table
echo "-- add barcodelabel"
#cat all.merge.fasta.noprimer.fa | sed "s/^>R1+2-\(.*\)\_\([0-9]*\);/>R1+2-\1_\2;barcodelabel=\1;/g" | sed "s/^>R1-\([a-zA-Z0-9-]*\)\_\([0-9]*\)/>R1-\1_\2;barcodelabel=\1;/g" > all.merge.bc.fasta
python ../_resources/python/add_barcodelabel.py -i all.merge.fasta.noprimer.fa -o all.merge.bc.fasta
echo "-- map data against ASVs"
$vsearch --usearch_global all.merge.bc.fasta \
--db zotus.merge.fa \
--strand plus \
--id 0.97 \
--uc map.merge.uc \
--otutabout asv.tab-csv \
--sizeout \
--threads $threads 2> logs/_mapping.log
grep "Matching" logs/_mapping.log
echo "-- create output folder $project.$nowformat; add first files"
# copy final files into output folder
mkdir -p ../$project.$nowformat
cp config.txt ../$project.$nowformat/config.txt
cp zotus.merge.fa ../$project.$nowformat/asvs.merge.fa
# cp asv_table.merge.txt ../$project.$nowformat/asv_table.merge.txt
cp asv.tab-csv ../$project.$nowformat/asv_table.merge.txt
# prepare final project information file
echo "name;id;own;year;marker;description;participants;doi;repository;accession;ignore" > ../$project.$nowformat/project.csv
echo "$project;$project;1;$nowformat;$marker;;;;;;" >> ../$project.$nowformat/project.csv
# prepare final sample information file
echo "project;name;host;collectionDate;location;country;bioregion;latitude;longitude;tissue;treatment;sampletype;notes" > ../$project.$nowformat/samples.csv
head -n 1 asv.tab-csv | sed -e "s/#OTU ID[[:space:]]//g" | tr "\t" "\n" | sed "s/^/$project;/" >> ../$project.$nowformat/samples.csv
fi #end classificationOnly
##### create taxonomy
echo " "
echo "===================================="
echo "Taxonomic classification"
echo "===================================="
# $vsearch
# old script version: keep to troubleshoot
# refDBs=($(grep "refdb" config.txt | cut -f2 -d"=" | sed 's/\"//g'))
# hieDBs=($(grep "hiedb" config.txt | cut -f2 -d"=" | sed 's/\"//g'))
threshold=$tax_threshold
echo "Classification threshold: $threshold"
# switch here for marker
rm -f taxonomy.vsearch # -f: do not fail if the file does not exist yet
if [ "$marker" = "ITS2" ]
then
echo ",kingdom,phylum,order,family,genus,species" > taxonomy.vsearch
echo ",kingdom,phylum,order,family,genus,species" > taxonomy.blast
fi
if [ "$marker" = "COI-5P" ]
then
echo ",kingdom,phylum,class,order,family,genus,species" > taxonomy.vsearch
echo ",kingdom,phylum,class,order,family,subfamily,genus,species, subspecies" > taxonomy.blast
fi
if [ "$marker" = "16S" ]
then
echo ",kingdom,phylum,order,family,genus" > taxonomy.vsearch
echo ",kingdom,phylum,order,family,genus" > taxonomy.blast
fi
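# Iterative direct classification: each reference DB in refDBs is queried in
# turn; only ASVs without a hit at the threshold proceed to the next DB, and
# the final leftovers go to the hierarchical SINTAX classification below.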
countdb=0
cp zotus.merge.fa zotus.direct.$countdb.uc.nohit.fasta
prevDB=$countdb
for db in "${refDBs[@]}"
do
countdb=$((countdb+1))
echo "-- Direct vsearch Classification level: $countdb";
echo "DB: $db";
$vsearch --usearch_global zotus.direct.$prevDB.uc.nohit.fasta \
--db $db \
--id 0.$threshold \
--uc zotus.direct.$countdb.uc \
--fastapairs zotus.direct.$countdb.fasta \
--strand both \
--threads $threads 2> logs/_direct.$countdb.log
grep "^N[[:space:]]" zotus.direct.$countdb.uc | cut -f 9 > zotus.direct.$countdb.uc.nohit
total_xhits=$(wc -l zotus.direct.$countdb.uc | grep -o '[0-9]*' | head -n 1)
counted_hits=$(grep -c -v "^N[[:space:]]" zotus.direct.$countdb.uc )
counted_nohits=$(grep -c "^N[[:space:]]" zotus.direct.$countdb.uc )
listed_nohits=$(wc -l zotus.direct.$countdb.uc.nohit | grep -o '[0-9]*' | head -n 1)
echo "Total: $total_xhits, thereof $counted_hits hits and $counted_nohits no hits"
echo "Listed for next iteration: $listed_nohits"
python ../_resources/python/subset_fasta.py -i zotus.merge.fa \
-l zotus.direct.$countdb.uc.nohit \
-o zotus.direct.$countdb.uc.nohit.fasta
#$seqfilter zotus.merge.fa --ids zotus.direct.$countdb.uc.nohit --out zotus.direct.$countdb.uc.nohit.fasta
filtered_nohits=$(grep -c ">" zotus.direct.$countdb.uc.nohit.fasta)
echo "Filtered: $filtered_nohits"
cut -f 9,10 zotus.direct.$countdb.uc | grep -v "*" | sed "s/[A-Za-z0-9_-]*;tax=//" >> taxonomy.vsearch
prevDB=$countdb
done
echo "-- Hierarchical vsearch classification";
echo "DB: $hieDBs";
$vsearch --sintax zotus.direct.$countdb.uc.nohit.fasta \
--db $hieDBs \
--tabbedout zotus.uc.merge.nohit.sintax \
--strand plus \
--sintax_cutoff $sintax_cutoff \
--threads $threads 2> logs/_sintax.log
python ../_resources/python/sintax_overview.py zotus.uc.merge.nohit.sintax
cut -f1,4 zotus.uc.merge.nohit.sintax | sed -E -e "s/\_[0-9]+//g" -e "s/,s:.*$//" >> taxonomy.vsearch
# v3 idea [TODO]: phylo + spc estimation on sintax results
echo "-- polishing and copying output files"
python3 ../_resources/python/fix_output_files.py --tax taxonomy.vsearch --asv asv.tab-csv
cp taxonomy.vsearch ../$project.$nowformat/taxonomy.vsearch
cp asv.tab-csv ../$project.$nowformat/asv_table.merge.txt
echo "-- create R script"
# CREATE R SCRIPT
cat ../_resources/R_template_header.R > ../$project.$nowformat/R_$project.v0.R
echo "# Created: $now" >>../$project.$nowformat/R_$project.v0.R
echo "# Project: $project" >>../$project.$nowformat/R_$project.v0.R
echo "# Marker: $marker" >>../$project.$nowformat/R_$project.v0.R
echo "# For: $datasupplier" >>../$project.$nowformat/R_$project.v0.R
cat ../_resources/R_template_libraries.R >> ../$project.$nowformat/R_$project.v0.R
echo "\n\n# Setting working directory (check path)" >>../$project.$nowformat/R_$project.v0.R
echo "setwd('$(pwd)/../$project.$nowformat')" >>../$project.$nowformat/R_$project.v0.R
echo "\n\n# Custom functions inclusion" >>../$project.$nowformat/R_$project.v0.R
echo "marker=\"$marker\"" >>../$project.$nowformat/R_$project.v0.R
echo "source('$(pwd)/../_resources/metabarcoding_tools_0-1a.R')" >>../$project.$nowformat/R_$project.v0.R
cat ../_resources/R_template_ITS2.R >> ../$project.$nowformat/R_$project.v0.R
cp ../_resources/metabarcoding_tools_0-1a.R ../$project.$nowformat/
mkdir -p ../$project.$nowformat/plots
if [ "${phylogeny:-0}" -eq 1 ]
then
# align ASVs with MAFFT and build a maximum-likelihood tree (GTR+G) with raxml-ng
$mafft zotus.merge.fa > zotus.merge.mafft.fa
sed "s/;size/_size/g" zotus.merge.mafft.fa > zotus.merge.mafft2.fa
$raxml --msa zotus.merge.mafft2.fa --model GTR+G --msa-format FASTA --threads $threads --prefix zotus
sed "s/;size/_size/g" ./taxonomy.vsearch > ../$project.$nowformat/taxonomy.vsearch
sed "s/;size/_size/g" asv.tab-csv > ../$project.$nowformat/asv_table.merge.txt
# with --prefix zotus, raxml-ng writes its best tree to zotus.raxml.bestTree
cp ./zotus.raxml.bestTree ../$project.$nowformat/asvs.tre
fi
$vsearch -v > logs/software.version 2>&1
cd ..
# cleanup if wanted
if [ "${compressionCleanup:-0}" -eq 1 ]
then
echo "-- compressing large files"
bash _compression_cleanup_1.sh $project
else
echo "NO compression and cleanup applied. Consider calling:"
echo " bash _compression_cleanup_1.sh $project"
fi
) | tee $project.$nowformat/script_$nowformat.log