Skip to content

Commit 67a4f37

Browse files
authored
Merge pull request #875 from pavanvidem/hic-new-features
add new little requested features
2 parents 879650f + c416e9a commit 67a4f37

13 files changed

+242
-50
lines changed

hicexplorer/chicAggregateStatistic.py

+41-19
Original file line numberDiff line numberDiff line change
@@ -63,20 +63,31 @@ def parse_arguments(args=None):
6363
return parser
6464

6565

66-
def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetIntervalTree=None, pTargetFile=None):
66+
def filter_scores_target_list(pScoresDictionary, pTargetFType, pTargetPosDict, pTargetList=None, pTargetIntervalTree=None, pTargetFile=None):
6767

6868
accepted_scores = {}
6969
same_target_dict = {}
7070
target_regions_intervaltree = None
71-
if pTargetList is not None:
72-
71+
# newly added
72+
if pTargetFType == 'hdf5':
7373
# read hdf content for this specific combination
7474
targetFileHDF5Object = h5py.File(pTargetFile, 'r')
7575
target_object = targetFileHDF5Object['/'.join(pTargetList)]
7676
chromosome = target_object.get('chromosome')[()].decode("utf-8")
7777
start_list = list(target_object['start_list'][:])
7878
end_list = list(target_object['end_list'][:])
7979
targetFileHDF5Object.close()
80+
elif pTargetFType == 'bed4':
81+
chromosome = pTargetPosDict[pTargetList[-1]]['chromosome']
82+
start_list = pTargetPosDict[pTargetList[-1]]['start_list']
83+
end_list = pTargetPosDict[pTargetList[-1]]['end_list']
84+
elif pTargetFType == 'bed3':
85+
target_regions_intervaltree = pTargetIntervalTree
86+
else:
87+
log.error('No target list given.')
88+
raise Exception('No target list given.')
89+
90+
if pTargetList is not None:
8091
chromosome = [chromosome] * len(start_list)
8192

8293
target_regions = list(zip(chromosome, start_list, end_list))
@@ -85,12 +96,6 @@ def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetInterv
8596

8697
hicmatrix = hm.hiCMatrix()
8798
target_regions_intervaltree = hicmatrix.intervalListToIntervalTree(target_regions)[0]
88-
elif pTargetIntervalTree is not None:
89-
target_regions_intervaltree = pTargetIntervalTree
90-
91-
else:
92-
log.error('No target list given.')
93-
raise Exception('No target list given.')
9499

95100
for key in pScoresDictionary:
96101
chromosome = pScoresDictionary[key][0]
@@ -193,12 +198,12 @@ def writeAggregateHDF(pOutFileName, pOutfileNamesList, pAcceptedScoresList, pArg
193198
aggregateFileH5Object.close()
194199

195200

196-
def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pViewpointObj, pQueue=None, pOneTarget=False):
201+
def run_target_list_compilation(pInteractionFilesList, pTargetList, pTargetFType, pTargetPosDict, pArgs, pViewpointObj, pQueue=None, pOneTarget=False):
197202
outfile_names_list = []
198203
accepted_scores_list = []
199204
target_regions_intervaltree = None
200205
try:
201-
if pOneTarget == True:
206+
if pTargetFType == 'bed3':
202207
try:
203208
target_regions = utilities.readBed(pTargetList)
204209
except Exception as exp:
@@ -211,14 +216,13 @@ def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pView
211216
outfile_names_list_intern = []
212217
accepted_scores_list_intern = []
213218
for sample in interactionFile:
214-
215219
interaction_data, interaction_file_data, _ = pViewpointObj.readInteractionFile(pArgs.interactionFile, sample)
216220
if pOneTarget == True:
217221
target_file = None
218222
else:
219223
target_file = pTargetList[i]
220224

221-
accepted_scores = filter_scores_target_list(interaction_file_data, pTargetList=target_file, pTargetIntervalTree=target_regions_intervaltree, pTargetFile=pArgs.targetFile)
225+
accepted_scores = filter_scores_target_list(interaction_file_data, pTargetFType, pTargetPosDict, pTargetList=target_file, pTargetIntervalTree=target_regions_intervaltree, pTargetFile=pArgs.targetFile)
222226

223227
outfile_names_list_intern.append(sample)
224228
accepted_scores_list_intern.append(accepted_scores)
@@ -238,7 +242,7 @@ def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pView
238242
return
239243

240244

241-
def call_multi_core(pInteractionFilesList, pTargetFileList, pFunctionName, pArgs, pViewpointObj):
245+
def call_multi_core(pInteractionFilesList, pTargetFileList, pTargetFType, pTargetPosDict, pFunctionName, pArgs, pViewpointObj):
242246
if len(pInteractionFilesList) < pArgs.threads:
243247
pArgs.threads = len(pInteractionFilesList)
244248
outfile_names_list = [None] * pArgs.threads
@@ -272,6 +276,8 @@ def call_multi_core(pInteractionFilesList, pTargetFileList, pFunctionName, pArgs
272276
process[i] = Process(target=pFunctionName, kwargs=dict(
273277
pInteractionFilesList=interactionFileListThread,
274278
pTargetList=targetFileListThread,
279+
pTargetFType=pTargetFType,
280+
pTargetPosDict=pTargetPosDict,
275281
pArgs=pArgs,
276282
pViewpointObj=pViewpointObj,
277283
pQueue=queue[i],
@@ -318,16 +324,32 @@ def main(args=None):
318324

319325
targetList = []
320326
present_genes = {}
327+
target_ftype = ''
328+
targetPosDict = None
321329
# read hdf file
322330
interactionFileHDF5Object = h5py.File(args.interactionFile, 'r')
323331
keys_interactionFile = list(interactionFileHDF5Object.keys())
324332

325333
if h5py.is_hdf5(args.targetFile):
326-
327334
targetDict, present_genes = viewpointObj.readTargetHDFFile(args.targetFile)
335+
target_ftype = 'hdf5'
328336

329337
else:
330-
targetList = [args.targetFile]
338+
with open(args.targetFile) as file:
339+
for line in file.readlines():
340+
if line.startswith('#'):
341+
continue
342+
_line = line.strip().split('\t')
343+
break
344+
if len(_line) == 4:
345+
log.info('Targets BED contains 4 columns, aggregating on column 4')
346+
target_ftype = 'bed4'
347+
present_genes, targetDict, targetPosDict = utilities.readTargetBed(args.targetFile)
348+
elif len(_line) == 3:
349+
targetList = [args.targetFile]
350+
target_ftype = 'bed3'
351+
else:
352+
log.error('BED of targets list must have 3 or 4 columns')
331353

332354
if len(keys_interactionFile) > 1:
333355

@@ -346,7 +368,7 @@ def main(args=None):
346368
geneList2 = sorted(list(matrix_obj2[chromosome2].keys()))
347369

348370
for gene1, gene2 in zip(geneList1, geneList2):
349-
if h5py.is_hdf5(args.targetFile):
371+
if target_ftype != 'bed3':
350372
if gene1 in present_genes[sample][sample2]:
351373
interactionDict[gene1] = [[sample, chromosome1, gene1], [sample2, chromosome2, gene2]]
352374
else:
@@ -356,7 +378,7 @@ def main(args=None):
356378

357379
interactionFileHDF5Object.close()
358380

359-
if h5py.is_hdf5(args.targetFile):
381+
if target_ftype != 'bed3':
360382
key_outer_matrix = present_genes.keys()
361383
for keys_outer in key_outer_matrix:
362384
keys_inner_matrix = present_genes[keys_outer].keys()
@@ -365,5 +387,5 @@ def main(args=None):
365387
interactionList.append(interactionDict[gene])
366388
targetList.append(targetDict[gene])
367389

368-
outfile_names_list, accepted_scores_list = call_multi_core(interactionList, targetList, run_target_list_compilation, args, viewpointObj)
390+
outfile_names_list, accepted_scores_list = call_multi_core(interactionList, targetList, target_ftype, targetPosDict, run_target_list_compilation, args, viewpointObj)
369391
writeAggregateHDF(args.outFileName, outfile_names_list, accepted_scores_list, args)

hicexplorer/hicCorrectMatrix.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,9 @@ def correct_subparser():
235235
'of chromosomes and/or translocations.',
236236
action='store_true')
237237

238+
parserOpt.add_argument('--filteredBed',
239+
help='Print bins filtered our by --filterThreshold to this file')
240+
238241
parserOpt.add_argument('--verbose',
239242
help='Print processing status.',
240243
action='store_true')
@@ -548,6 +551,7 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False):
548551
to avoid introducing bias due to different chromosome numbers
549552
550553
"""
554+
log.info("filtering by z-score")
551555
to_remove = []
552556
if perchr:
553557
for chrname in list(hic_ma.interval_trees):
@@ -575,6 +579,7 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False):
575579
"\n".format(chrname, lower_threshold, upper_threshold))
576580

577581
to_remove.extend(problematic)
582+
578583
else:
579584
row_sum = np.asarray(hic_ma.matrix.sum(axis=1)).flatten()
580585
# subtract from row sum, the diagonal
@@ -584,7 +589,6 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False):
584589
mad = MAD(row_sum)
585590
to_remove = np.flatnonzero(mad.is_outlier(
586591
lower_threshold, upper_threshold))
587-
588592
return sorted(to_remove)
589593

590594

@@ -658,6 +662,12 @@ def main(args=None):
658662
restore_masked_bins=False)
659663

660664
assert matrix_shape == ma.matrix.shape
665+
666+
if args.filteredBed:
667+
with open(args.filteredBed, 'w') as f:
668+
for outlier_region in set(outlier_regions):
669+
interval = ma.cut_intervals[outlier_region]
670+
f.write(f"{interval[0]}\t{interval[1]}\t{interval[2]}\t.\t{interval[3]}\t.\n")
661671
# mask filtered regions
662672
ma.maskBins(outlier_regions)
663673
total_filtered_out = set(outlier_regions)

hicexplorer/test/general/test_chicAggregateStatistic.py

+58-18
Original file line numberDiff line numberDiff line change
@@ -113,41 +113,81 @@ def test_regular_mode_threads():
113113
aggregateFileH5Object.close()
114114

115115

116-
def test_target_list():
116+
def test_target_list_bed3():
117117
outfile_aggregate = NamedTemporaryFile(suffix='.hdf5', delete=False)
118118
outfile_aggregate.close()
119119
args = "--interactionFile {} --targetFile {} --outFileName {} \
120-
-t {}".format(ROOT + 'chicViewpoint/two_matrices.hdf5',
121-
ROOT + 'chicAggregateStatistic/target_list.bed',
120+
-t {}".format(ROOT + 'chicViewpoint/two_matrices_custom_keys.hdf5',
121+
ROOT + 'chicAggregateStatistic/target_list_3col.bed',
122122
outfile_aggregate.name, 10).split()
123123
chicAggregateStatistic.main(args)
124124

125125
aggregateFileH5Object = h5py.File(outfile_aggregate.name, 'r')
126-
assert 'FL-E13-5_chr1_MB-E10-5_chr1' in aggregateFileH5Object
127-
assert 'FL-E13-5_chr1' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']
128-
assert 'MB-E10-5_chr1' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']
126+
assert 'c_adj_norm_t_adj_norm' in aggregateFileH5Object
127+
assert 'c_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']
128+
assert 't_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']
129129

130-
assert 'genes' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1']
131-
assert 'genes' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1']
130+
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']
131+
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']
132132
assert len(aggregateFileH5Object) == 1
133133
assert aggregateFileH5Object.attrs['type'] == 'aggregate'
134134

135-
for chromosome in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1']:
135+
for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']:
136136

137-
assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome]) == 3
137+
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]) == 3
138138

139-
for gene in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome]:
140-
assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome][gene]) == 7
141-
for data in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome][gene]:
139+
for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]:
140+
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]) == 7
141+
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]:
142142
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']
143143

144-
for chromosome in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1']:
144+
for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']:
145145

146-
assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome]) == 3
146+
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]) == 3
147147

148-
for gene in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome]:
149-
assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome][gene]) == 7
150-
for data in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome][gene]:
148+
for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]:
149+
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]) == 7
150+
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]:
151+
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']
152+
153+
aggregateFileH5Object.close()
154+
155+
156+
def test_target_list_bed4():
157+
outfile_aggregate = NamedTemporaryFile(suffix='.hdf5', delete=False)
158+
outfile_aggregate.close()
159+
args = "--interactionFile {} --targetFile {} --outFileName {} \
160+
-t {}".format(ROOT + 'chicViewpoint/two_matrices_custom_keys.hdf5',
161+
ROOT + 'chicAggregateStatistic/target_list_4col.bed',
162+
outfile_aggregate.name, 10).split()
163+
chicAggregateStatistic.main(args)
164+
165+
aggregateFileH5Object = h5py.File(outfile_aggregate.name, 'r')
166+
assert 'c_adj_norm_t_adj_norm' in aggregateFileH5Object
167+
assert 'c_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']
168+
assert 't_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']
169+
170+
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']
171+
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']
172+
assert len(aggregateFileH5Object) == 1
173+
assert aggregateFileH5Object.attrs['type'] == 'aggregate'
174+
175+
for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']:
176+
177+
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]) == 3
178+
179+
for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]:
180+
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]) == 7
181+
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]:
182+
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']
183+
184+
for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']:
185+
186+
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]) == 3
187+
188+
for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]:
189+
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]) == 7
190+
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]:
151191
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']
152192

153193
aggregateFileH5Object.close()

hicexplorer/test/general/test_hicCorrectMatrix.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,35 @@
1616
os.path.dirname(os.path.abspath(__file__))), "test_data/")
1717

1818

19+
def are_files_equal(file1, file2, pDifference=1):
    """Compare two text files line by line with a per-line tolerance.

    Parameters
    ----------
    file1, file2 : str
        Paths of the text files to compare.
    pDifference : int
        Maximum number of differing characters tolerated within a single
        pair of lines. Characters present in only one of the two lines
        (i.e. a difference in line length) count as differences too.

    Returns
    -------
    bool
        True if both files have the same number of lines and no pair of
        lines differs by more than ``pDifference`` characters, else False.
    """
    # Local import so the helper does not depend on the module's import block.
    from itertools import zip_longest

    with open(file1) as textfile1, open(file2) as textfile2:
        # zip_longest (instead of zip) so that a file with extra lines, or an
        # empty file, is not silently treated as equal to the other one.
        for x, y in zip_longest(textfile1, textfile2):
            if x is None or y is None:
                # One file ended before the other: line counts differ.
                return False
            if x != y:
                # Positional mismatches plus the characters that only exist
                # in the longer of the two lines.
                count = sum(1 for a, b in zip(x, y) if a != b)
                count += abs(len(x) - len(y))
                if count > pDifference:
                    return False
    return True
27+
28+
1929
def test_correct_matrix_ICE():
2030
outfile = NamedTemporaryFile(suffix='.ICE.h5', delete=False)
2131
outfile.close()
2232

33+
outfile_filtered = NamedTemporaryFile(suffix='.bed', delete=True)
34+
2335
args = "correct --matrix {} --correctionMethod ICE --chromosomes "\
24-
"chrUextra chr3LHet --iterNum 500 --outFileName {} "\
36+
"chrUextra chr3LHet --iterNum 500 --outFileName {} --filteredBed {} "\
2537
"--filterThreshold -1.5 5.0".format(ROOT + "small_test_matrix.h5",
26-
outfile.name).split()
38+
outfile.name,
39+
outfile_filtered.name).split()
2740
# hicCorrectMatrix.main(args)
2841
compute(hicCorrectMatrix.main, args, 5)
2942
test = hm.hiCMatrix(
3043
ROOT + "hicCorrectMatrix/small_test_matrix_ICEcorrected_chrUextra_chr3LHet.h5")
3144
new = hm.hiCMatrix(outfile.name)
3245
nt.assert_equal(test.matrix.data, new.matrix.data)
3346
nt.assert_equal(test.cut_intervals, new.cut_intervals)
47+
assert are_files_equal(outfile_filtered.name, ROOT + 'hicCorrectMatrix/filtered.bed')
3448

3549
os.unlink(outfile.name)
3650

Binary file not shown.

hicexplorer/test/test_data/cHi-C/chicAggregateStatistic/target_list.bed

-8
This file was deleted.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
chr1 4480000 4549000
2+
chr1 4555000 4688000
3+
chr1 14274000 14279000
4+
chr1 14290000 14439000
5+
chr1 14444000 14467000
6+
chr1 14476000 14501000
7+
chr1 19077000 19118000
8+
chr1 19120000 19274000
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
chr1 4487335 4487535 Sox17
2+
chr1 14300180 14300380 Eya1
3+
chr1 19093003 19093203 Tfap2d
Binary file not shown.

0 commit comments

Comments
 (0)