-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpl_concat.sh
71 lines (52 loc) · 2.9 KB
/
pl_concat.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/bin/bash
#######################################
# Print the usage text of pl_concat.sh and terminate the script.
# Arguments: none
# Outputs:   usage text on stdout
# Exits:     always with status 1, also when help was requested with -h
#######################################
help() {
  printf '\n========= Usage =========\n\n'
  printf '%s\n\n' \
    '-d (conserved_coregenes_dir) Provide input directory path where conserved coregenes are located' \
    '-s (suffix) Provide suffix of the files' \
    '-n (samplenames) Provide file containing samplenames (i.e. names of the organisms). Optional: when omitted, the names are taken from the headers of the -f file'

  # Worked example: two aligned gene files and the matching samplenames file.
  printf '\tExample:\n\n'
  printf '\t%s\n' \
    'fasta file1:' \
    '>organism1.1.1' \
    'MPGIAIIGAQWGDEGKGKIVDFLAPEAKFVARYQGGANAGHTVNAKGKTFKLNLLPSGVL' \
    '>organism2.2.1' \
    'HEQTVSVLGDGMVIDPEKFLAERQNLLDGGLTPELRISERAHLVLPHHKYVDGRKDFVGL'
  printf '\n'
  printf '\t%s\n' \
    'fasta file2:' \
    '>organism1.2.1' \
    'MPGIAIIGAQWGDEGKGKITDFLAPQANYVVRYQGGANAGHTVTAKGQTFKLNLLPSGVLL' \
    '>organism2.2.1' \
    'TGRGIGPAYADRARRVGIRFGDLSDLSVLRERVERLLEAKPNSTAAAGWGSVSDALGYLLG'
  printf '\n'
  printf '\t%s\n' \
    'samplenames:' \
    'organism1' \
    'organism2'
  printf '\n'

  printf '%s\n\n' \
    '-f (reference_file) Provide name of one of the files in conserved coregenes folder (its headers supply the sample names when -n is not given)' \
    '-o (outfold) Provide output directory path; supermatrix.fasta is written there' \
    '-h Show this help'
  printf '\n%s\n\n' \
    'bash pl_concat.sh -d /Users/priyalakra/Desktop/dwdpractice/conserved_coregenes -s fasta -f 89706_murrayi_DSM11303_peg.858.faa_aligned.fasta -o /Users/priyalakra/Desktop'
  exit 1
}
## Parse the command line into the variables used by the rest of the script:
##   -d -> conserved_coregenes_dir   -s -> suffix      -n -> samplenames
##   -f -> reference_file            -o -> outfold     -h -> usage
## The leading ':' puts getopts in silent mode, so a missing option argument
## (opt is ':') and an unknown option (opt is '?') are reported here instead of
## being skipped. help prints the usage text and exits, so nothing runs after it.
while getopts ":d:s:n:f:o:h" opt; do
  case "$opt" in
    d) conserved_coregenes_dir="$OPTARG" ;;
    s) suffix="$OPTARG" ;;
    n) samplenames="$OPTARG" ;;
    f) reference_file="$OPTARG" ;;
    o) outfold="$OPTARG" ;;
    h) help ;;
    :)
      printf 'Error: option -%s requires an argument\n' "$OPTARG" >&2
      help
      ;;
    \?)
      printf 'Error: unknown option -%s\n' "$OPTARG" >&2
      help
      ;;
  esac
done
## Build the super matrix from the parsed options:
##   1. concatenate every *.<suffix> file of the core-genes folder,
##   2. put every sequence on a single line,
##   3. collect the records of each sample into its own file,
##   4. join each sample's sequences with "-" under a single ">sample" header,
##   5. concatenate the per-sample records into <outfold>/supermatrix.fasta.
## Intermediate files are kept in <conserved_coregenes_dir>/tempfiles.

# Print an error message on stderr and stop the script with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

## validate the options before any file is touched; an empty -d or -o would
## otherwise expand to paths directly under "/"
[[ -n "$conserved_coregenes_dir" ]] || die "-d (conserved_coregenes_dir) is required"
[[ -d "$conserved_coregenes_dir" ]] || die "directory not found: $conserved_coregenes_dir"
[[ -n "$suffix" ]] || die "-s (suffix) is required"
[[ -n "$outfold" ]] || die "-o (outfold) is required"
[[ -n "$samplenames" || -n "$reference_file" ]] \
  || die "provide -n (samplenames) or -f (reference_file)"

## resolve every path while still in the caller's directory, so that relative
## -d / -o / -n values stay valid after the working directory changes
core_dir=$(cd -- "$conserved_coregenes_dir" > /dev/null && pwd) \
  || die "cannot enter $conserved_coregenes_dir"
mkdir -p -- "$outfold" || die "cannot create output folder $outfold"
out_dir=$(cd -- "$outfold" > /dev/null && pwd) || die "cannot enter $outfold"
work_dir="$core_dir/tempfiles"

if [[ -n "$samplenames" ]]; then
  [[ -f "$samplenames" ]] || die "samplenames file not found: $samplenames"
  names_file="$(cd -- "$(dirname -- "$samplenames")" > /dev/null && pwd)/$(basename -- "$samplenames")"
else
  # reference_file is a file name inside the core-genes folder
  [[ -f "$core_dir/$reference_file" ]] \
    || die "reference file not found in $core_dir: $reference_file"
  names_file="$work_dir/samplenames"
fi

## --no-group-separator is a GNU grep option; use ggrep when it is installed,
## otherwise accept the system grep only if it understands the option
if command -v ggrep > /dev/null 2>&1; then
  gnu_grep=ggrep
elif grep -A1 --no-group-separator -e x <<< "x" > /dev/null 2>&1; then
  gnu_grep=grep
else
  die "a grep supporting --no-group-separator (GNU grep) is required"
fi

## concatenate the gene files, then work inside the tempfiles folder
gene_files=("$core_dir"/*."$suffix")
# an unmatched glob stays literal, so the first entry would not exist
[[ -e "${gene_files[0]}" ]] || die "no *.$suffix files found in $core_dir"
mkdir -p -- "$work_dir" || die "cannot create $work_dir"
cat "${gene_files[@]}" > "$work_dir/final.fa" || die "cannot write $work_dir/final.fa"
cd -- "$work_dir" || die "cannot enter $work_dir"
echo "dir changed to $PWD "

## remove unnecessary line breaks from the fasta file
awk '/^>/ { print (NR==1 ? "" : RS) $0; next } { printf "%s", $0 } END { printf RS }' \
  final.fa > final.ed.fa || die "cannot write $work_dir/final.ed.fa"

## start from an empty out1 tree so records left by an earlier run cannot end
## up in the new super matrix
rm -rf -- "${work_dir:?}/out1"
mkdir -p out1/out2/out3/out4 || die "cannot create the out1 folders in $work_dir"

## without -n, take the sample names from the reference file headers:
## the text between ">" and the first "." or space
if [[ -z "$samplenames" ]]; then
  grep ">" "$core_dir/$reference_file" \
    | cut -f1 -d " " | cut -f1 -d "." | cut -f2 -d ">" > "$names_file" \
    || die "cannot write $names_file"
fi

## extract the records of every sample (matching line plus the line after it)
# NOTE(review): the name is used as an unanchored grep pattern, so "organism1"
# also selects "organism10" records - confirm that names are never prefixes of
# each other.
# read without IFS= on purpose: surrounding blanks of a name are trimmed
while read -r sample || [[ -n "$sample" ]]; do
  # a blank line would be an empty pattern, which matches every line
  [[ -n "$sample" ]] || continue
  "$gnu_grep" -A1 --no-group-separator -e "$sample" final.ed.fa > "out1/$sample.fasta" \
    || printf 'Warning: no record found for sample %s\n' "$sample" >&2
done < "$names_file"

## per sample: drop the header lines, join the sequences with "-" and put a
## single ">sample" header in front of the joined sequence
for sample_fasta in out1/*.fasta; do
  [[ -e "$sample_fasta" ]] || continue
  file_name=${sample_fasta##*/}
  # strip only the extension so names containing dots are kept whole
  sample=${file_name%.fasta}
  sequences="out1/out2/$file_name.edited"
  joined="out1/out2/out3/$file_name.edited.edit"
  record="out1/out2/out3/out4/$file_name.edited.edit.final.fasta"
  grep -v ">" "$sample_fasta" > "$sequences"
  paste -d "-" -s "$sequences" > "$joined"
  awk -v n="$sample" 'NR==1{print ">"n"\n"$0}' "$joined" > "$record"
done

## concatenate the per-sample records into the super matrix
records=(out1/out2/out3/out4/*.fasta)
[[ -e "${records[0]}" ]] || die "no sample records were produced; check the sample names"
cat "${records[@]}" > "$out_dir/supermatrix.fasta" \
  || die "cannot write $out_dir/supermatrix.fasta"
printf '\n\tSuper matrix fasta file is located in %s folder. Use this file for further analysis\n\n' "$out_dir"