-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsnpSniffer_Summarize.sh
executable file
·158 lines (128 loc) · 4.74 KB
/
snpSniffer_Summarize.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env bash
# This code will replace the old "Run_SnpSniffer.sh" script
# Capture the current script path to dynamically link default input database.ini file
SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=`dirname ${SCRIPT_PATH}`
## Usage help message
usage()
{
# `cat << EOF` This means that cat should stop reading when EOF is detected
cat << EOF
Usage: $0 [ options ]
-h Display Help
Required Options
-d Input Database File [databaseV5_hg38_ucsc.ini]
-f Filter Library Level Results (Yes/No) [Yes]
NOTES: Default database supports human GRCh38/hg38 coordinates WITHOUT chr prefixes
EOF
# EOF is found above and hence cat command stops reading. This is equivalent to echo but much neater when printing out.
exit 2
}
## Assign default values
DATABASE=${SCRIPT_DIR}/grch38_hg38_ucsc_contigs/databaseV5_hg38_ucsc.ini
FILTER=Yes
## Capture and assign variables from inputs
while getopts 'd:f:?h' flag
do
case ${flag} in
d) DATABASE=${OPTARG};;
f) FILTER=${OPTARG};;
h|?) usage;;
esac
done
###############################################
## Define Paths to Required scripts based on location of initial script
###############################################
SNP_SNIFFER_JAR=${SCRIPT_DIR}/snpSniffer.jar
SNP_SNIFFER_GRAPH=${SCRIPT_DIR}/snpSniffer_Summarize.R
###############################################
## Collect available genotype results and add to fresh database
###############################################
## Print the help result to the terminal
echo
echo
echo
java -jar ${SNP_SNIFFER_JAR} -help
echo
echo
echo
echo "---------------------------------------"
echo
echo "Creating snpSniffer Summary of Current Folder"
# Create an empty version of the database
cat ${DATABASE} > SnpSniffer_DB.ini
# Find all the pre-calculated genotype results (search for all expected extensions)
# Drop the library level results for simplified viewing
if [ $FILTER == "Yes" ]
then
find . -type f -name "*flt.vcf" ! -name "*[A-Z][0-9][0-9][0-9][0-9][0-9]*" > Temp_SnpSniffer_Genotype_Paths.txt
find . -type f -name "*snpsniffer.vcf" ! -name "*[A-Z][0-9][0-9][0-9][0-9][0-9]*" >> Temp_SnpSniffer_Genotype_Paths.txt
find . -type f -name "*snpSniffer.vcf" ! -name "*[A-Z][0-9][0-9][0-9][0-9][0-9]*" >> Temp_SnpSniffer_Genotype_Paths.txt
elif [ $FILTER == "No" ]
then
find . -type f -name "*flt.vcf" > Temp_SnpSniffer_Genotype_Paths.txt
find . -type f -name "*snpsniffer.vcf" >> Temp_SnpSniffer_Genotype_Paths.txt
find . -type f -name "*snpSniffer.vcf" >> Temp_SnpSniffer_Genotype_Paths.txt
else
echo
echo "------------------------"
echo ERROR incorrect value provided to option -f, only Yes or No are expected
echo
echo
exit 1
fi
# Add genotypes to the database
for line in `cat Temp_SnpSniffer_Genotype_Paths.txt`
do
java -jar ${SNP_SNIFFER_JAR} -add ${line} SnpSniffer_DB.ini
done
# Cleanup the temp file
rm Temp_SnpSniffer_Genotype_Paths.txt
###############################################
## Generate All Pairwise Comparisons
###############################################
# When -expected or -NotExpect is queried a full comparison is produced to a file called snpSniffer_output.txt, that is used for all summaries
java -jar ${SNP_SNIFFER_JAR} -expected _ 2 SnpSniffer_DB.ini > Temp_Expected_Results.txt
# Remove the unwanted file
rm Temp_Expected_Results.txt
# Preprocess the generic full comparison output
## Replace unwanted features with tab-separtor
## Create sorted list of line by line comparisons and drop duplicates
sed 's/ & /\'$'\t''/g' snpSniffer_output.txt \
| \
sed 's/ count=/\'$'\t''/g' \
| \
sed 's/ match=/\'$'\t''/g' \
| \
sed 's/ ratio=/\'$'\t''/g' \
| \
sed 's/.bwa.bam.snpSniffer//g' \
| \
sed 's/.star.bam.snpSniffer//g' \
| \
awk '{print ($1 > $2) ? $0"\t"$1"-"$2 : $0"\t"$2"-"$1}' \
| \
awk '!x[$6]++'> SnpSniffer_AllPairs_Results.txt
# Remove preliminary output file
rm snpSniffer_output.txt
###############################################
## Generate Heterozygous Genotype Summary
###############################################
# Extract het summary from database using snpSniffer.jar
java -jar ${SNP_SNIFFER_JAR} -het SnpSniffer_DB.ini > Temp1_HetRate_Results.txt
# Remove header lines
awk 'NR>1' Temp1_HetRate_Results.txt \
| \
sed 's/.bwa.bam.snpSniffer//g' \
| \
sed 's/.star.bam.snpSniffer//g' > Temp2_HetRate_Results.txt
###############################################
## Summarize and Graph with R (Tidyverse)
###############################################
Rscript --vanilla ${SNP_SNIFFER_GRAPH} \
--comp_file SnpSniffer_AllPairs_Results.txt \
--het_file Temp2_HetRate_Results.txt
# Remove unneeded files
rm SnpSniffer_AllPairs_Results.txt
rm Temp1_HetRate_Results.txt
rm Temp2_HetRate_Results.txt