Skip to content

Commit 1500c66

Browse files
committed
[DEV] hicpro2juicebox now support old allValidPairs format
1 parent 8e85a7f commit 1500c66

File tree

4 files changed

+92
-16
lines changed

4 files changed

+92
-16
lines changed

NEWS

+4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ CHANGES IN VERSION 2.10.0
33

44
NEW FEATURES
55

6+
o hicpro2juicebox.sh utility now supports alof HiC-Pro format (< 2.7.5)
7+
68
SIGNIFICANT USER-VISIBLE CHANGES
79

810
o udpate R scripts to be compatible with the lastest ggplot2 version (>2.2.1)
@@ -11,6 +13,8 @@ SIGNIFICANT USER-VISIBLE CHANGES
1113

1214
BUG FIXES
1315

16+
o Fix bug to avoid floating values in valid pair positions
17+
1418
o Fix bug in order of samtools sort parameter in bowtie_combine.sh
1519

1620
o Fix bug in output option of makeViewpoints

bin/utils/extract_snps.py

+69-14
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def usage():
2828
print "-a/--alt <sample name for alternative allele>"
2929
print "[-r/--ref] <sample name for reference allele. Default is the same as in the initial file>"
3030
print "[-f/--filt] <Filtering level. 0 - no filtering. 1 - based on FI information. 2 - based on FILTER information. Default is 2>"
31+
print "[-x/--exclude] <Exclude potential contaminants from the list of SNPs which are equal to ALT genotype, i.e REF is expected to be contaminated>"
3132
print "[-v/--verbose] <Verbose>"
3233
print "[-h/--help] <Help>"
3334
return
@@ -37,11 +38,12 @@ def get_args():
3738
try:
3839
opts, args = getopt.getopt(
3940
sys.argv[1:],
40-
"i:r:a:f:vh",
41+
"i:r:a:f:x:vh",
4142
["vcf=",
4243
"alt=",
4344
"ref=",
4445
"filt=",
46+
"excl=",
4547
"verbose", "help"])
4648
except getopt.GetoptError:
4749
usage()
@@ -50,7 +52,7 @@ def get_args():
5052

5153
## 0/0, 1/1, A, T, 1
5254
## 1/1, 2/2, A, T, C, 1
53-
def get_filter_snp_gt(gref, galt, ref, alt):
55+
def get_filter_snp_gt(gref, galt, ref, alt, conta):
5456

5557
#print gref
5658
#print galt
@@ -76,14 +78,21 @@ def get_filter_snp_gt(gref, galt, ref, alt):
7678
## Non informative SNPs
7779
if ref_snp == alt_snp:
7880
return -3
79-
else:
80-
## check alt and ref alleles
81-
alleles = []
82-
alleles.append(ref)
83-
for a in alt.split(','):
84-
alleles.append(a)
81+
## Remove SNPs that are different from the reference and therefore likely to be wrongly assigned to the alternative
82+
elif len(conta)>0:
83+
for i in range(len(conta)):
84+
conta_geno = re.split('/|\|', conta[i])
85+
## if conta = alt remove snps
86+
if conta_geno[0] == alt_geno[0] or conta_geno[1] == alt_geno[1]:
87+
return -4
88+
89+
## check alt and ref alleles
90+
alleles = []
91+
alleles.append(ref)
92+
for a in alt.split(','):
93+
alleles.append(a)
8594

86-
return [alleles[int(ref_snp)], alleles[int(alt_snp)]]
95+
return [alleles[int(ref_snp)], alleles[int(alt_snp)]]
8796

8897

8998
if __name__ == "__main__":
@@ -93,6 +102,7 @@ def get_filter_snp_gt(gref, galt, ref, alt):
93102
vcfFile = None
94103
refSample = None
95104
altSample = None
105+
exclusion = None
96106
filt_qual=2
97107
verbose = False
98108

@@ -112,6 +122,8 @@ def get_filter_snp_gt(gref, galt, ref, alt):
112122
altSample = arg
113123
elif opt in ("-f", "--filt"):
114124
filt_qual = int(arg)
125+
elif opt in ("-x", "--exclude"):
126+
exclusion = arg
115127
elif opt in ("-v", "--verbose"):
116128
verbose = True
117129
else:
@@ -133,16 +145,20 @@ def get_filter_snp_gt(gref, galt, ref, alt):
133145
samples = []
134146
altidx = -1
135147
refidx = -1
136-
148+
contaidx=[]
149+
137150
var_counter = 0
138151
snp_counter = 0
139152
hetero_counter = 0
140153
badqual_counter = 0
141154
undefined_counter = 0
142155
nonspe_counter = 0
156+
conta_counter = 0
143157

144158
for line in vcf_handle:
145159
line = line.rstrip()
160+
#print >> sys.stderr, line
161+
146162
## for now we don't care about the header
147163
if line.startswith('##'):
148164
if refSample is not None and line.startswith("##reference="):
@@ -158,7 +174,29 @@ def get_filter_snp_gt(gref, galt, ref, alt):
158174
refidx = i
159175
elif samples[i] == altSample:
160176
altidx = i
161-
177+
elif exclusion is not None:
178+
## conta idx
179+
exs=exclusion.split(",")
180+
for i in range(len(exs)):
181+
ct = exs[i]
182+
if samples[i] == ct:
183+
contaidx.append(i)
184+
if verbose:
185+
print >> sys.stderr, "## Potential Contaminant(s) = " + ct
186+
187+
188+
## Check if Bl6 is in the conta list
189+
if exclusion is not None:
190+
exs=exclusion.split(",")
191+
for i in range(len(exs)):
192+
ct = exs[i]
193+
if ct == "REF":
194+
contaidx.append(-1)
195+
if verbose:
196+
print >> sys.stderr, "## Potential Contaminant(s) = REF"
197+
198+
199+
162200
## Check input parameters
163201
if refSample != None and refidx == -1:
164202
print >> sys.stderr, "Error : REF sample not found"
@@ -185,7 +223,10 @@ def get_filter_snp_gt(gref, galt, ref, alt):
185223

186224
fields = line.split('\t',9)
187225
var_counter+=1
188-
226+
227+
## init list of contaminant
228+
contg=[]
229+
189230
## check chromosomes name
190231
if re.compile('^chr').match(fields[0]):
191232
chrom=fields[0]
@@ -214,24 +255,37 @@ def get_filter_snp_gt(gref, galt, ref, alt):
214255
else:
215256
refg = ["0/0"]
216257
reffi = "1"
258+
259+
if len(contaidx) > 0:
260+
for i in range(len(contaidx)):
261+
if contaidx[i] == -1:
262+
contg.append("0/0")
263+
else:
264+
cg=genotypes[contaidx[i]].split(':')
265+
cfi = cg[len(cg)-1]
266+
if filt_qual != 1 or filt_qual == 1 and cfi == str(1):
267+
contg.append(cg[0])
217268

218269
## Filter on FI field
219270
if filt_qual != 1 or (filt_qual == 1 and reffi == str(1) and altfi == str(1)):
220271
#print "---------"
221272
#print refg
222273
#print altg
223274
#print fields
224-
geno = get_filter_snp_gt(refg[0], altg[0], fields[3], fields[4])
275+
geno = get_filter_snp_gt(refg[0], altg[0], fields[3], fields[4], contg)
276+
225277
if geno == -1:
226278
undefined_counter += 1
227279
elif geno == -2:
228280
hetero_counter += 1
229281
elif geno == -3:
230282
nonspe_counter += 1
283+
elif geno == -4:
284+
conta_counter += 1
231285
else:
232286
snp_counter += 1
233287
#altg[0]="1/1"
234-
#print chrom + "\t" + fields[1] + "\t" + fields[2] + "\t" + geno[0] + "\t" + geno[1] + "\t" + fields[5] + "\t" + fields[6] + "\t" + fields[7] + "\t" + fields[8] + "\t" + ":".join(altg)
288+
##print chrom + "\t" + fields[1] + "\t" + fields[2] + "\t" + geno[0] + "\t" + geno[1] + "\t" + fields[5] + "\t" + fields[6] + "\t" + fields[7] + "\t" + fields[8] + "\t" + ":".join(altg)
235289
print chrom + "\t" + fields[1] + "\t" + fields[2] + "\t" + geno[0] + "\t" + geno[1] + "\t" + fields[5] + "\t" + fields[6] + "\t" + fields[7] + "\t" + "GT" + "\t" + "0/1"
236290

237291
else:
@@ -252,6 +306,7 @@ def get_filter_snp_gt(gref, galt, ref, alt):
252306
print >> sys.stderr, "## Number of heterozygous SNPs =", hetero_counter
253307
print >> sys.stderr, "## Number of undefined genotype SNPs =", undefined_counter
254308
print >> sys.stderr, "## Number of bad quality SNPs =", badqual_counter
309+
print >> sys.stderr, "## Number of potential contaminant SNPs =", conta_counter
255310

256311

257312
vcf_handle.close()

bin/utils/hicpro2juicebox.sh

+18-1
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,23 @@ if [[ ! -e $JUICEBOXJAR ]]; then
105105
exit 1
106106
fi
107107

108+
## Deal with old format
109+
nbfields=$(head -1 $VALIDPAIRS | awk '{print NF}')
110+
111+
if [[ $nbfields == "12" ]]; then
112+
echo -e "HiC-Pro format > 2.7.5 detected ..."
113+
114+
elif [[ $nbfields == "8" ]]; then
115+
echo -e "HiC-Pro format < 2.7.6 detected ..."
116+
echo -e "Adjusting AllValidPairs format ..."
117+
awk '{OFS="\t"; print $0,0,1,42,42}' $VALIDPAIRS > ${TEMP}/$$_format_AllValidPairs
118+
VALIDPAIRS=${TEMP}/$$_format_AllValidPairs
119+
120+
else
121+
echo -e "Error : unknown format - $nbfields detected, whereas 8 (< v2.7.6) or 12 (> v2.7.5) fields are expected !"
122+
exit 1
123+
fi
124+
108125
echo "Generating Juicebox input files ..."
109126

110127
if [[ ! -z $RESFRAG ]]; then
@@ -117,7 +134,7 @@ if [[ ! -z $RESFRAG ]]; then
117134

118135
## The “pre” command needs the contact map to be sorted by chromosome and grouped so that all reads for one chromosome (let’s say, chr1) appear in the same column.
119136
## Also, chromosomes should not have the ‘chr” substring and the strand is coded as 0 for positive and anything else for negative (in practice, 1).
120-
awk '{$4=$4!="+"; $7=$7!="+"; n1=split($9, frag1, "_"); n2=split($10, frag2, "_"); } $2<=$5{print $1, $4, $2, $3, frag1[n1], $7, $5, $6, frag2[n2], $11, $12 }$5<$2{ print $1, $7, $5, $6, frag2[3], $4, $2, $3, frag1[3], $12, $11}' $VALIDPAIRS | sort -k3,3d -k7,7d -S 90 > ${TEMP}/$$_allValidPairs.pre_juicebox_sorted
137+
awk '{$4=$4!="+"; $7=$7!="+"; n1=split($9, frag1, "_"); n2=split($10, frag2, "_"); } $2<=$5{print $1, $4, $2, $3, frag1[n1], $7, $5, $6, frag2[n2], $11, $12 }$5<$2{ print $1, $7, $5, $6, frag2[n2], $4, $2, $3, frag1[n1], $12, $11}' $VALIDPAIRS | sort -k3,3d -k7,7d -S 90 > ${TEMP}/$$_allValidPairs.pre_juicebox_sorted
121138
else
122139
awk '{$4=$4!="+"; $7=$7!="+"} $2<=$5{print $1, $4, $2, $3, 0, $7, $5, $6, 1, $11, $12 }$5<$2{ print $1, $7, $5, $6, 0, $4, $2, $3, 1, $12, $11 }' $VALIDPAIRS | sort -k3,3d -k7,7d -S 90 > ${TEMP}/$$_allValidPairs.pre_juicebox_sorted
123140
fi

scripts/mapped_2hic_fragments.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def get_read_pos(read):
136136
read : list
137137
list of aligned reads
138138
"""
139-
pos = read.pos + read.alen/2
139+
pos = read.pos + int(read.alen/2)
140140

141141
return pos
142142

0 commit comments

Comments
 (0)