Commit 1a914fa

Add indexing
1 parent de91b0c commit 1a914fa

1 file changed: src/result_analyzer/large_scale_analysis.py (+54 -11)

@@ -1,11 +1,12 @@
 import os
 import json
 import logging
-from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from analysis_utils import format_type
 from tqdm import tqdm
 from multiprocessing import cpu_count
 from threading import Lock
+from collections import defaultdict
 
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 TEST_DIR = os.path.join(
@@ -122,11 +123,14 @@ def load_and_sort_json(file_path):
 
 def measure_exact_matches(out, expected, tool_name=None, print_missed=False):
     """
-    Measure exact and partial matches between two JSON files.
+    Measure exact and partial matches between two JSON files using indexing for efficiency.
     """
     data_out = load_and_sort_json(out)
     data_expected = load_and_sort_json(expected)
 
+    # Create index for data_out
+    index = create_index(data_out)
+
     results = {
         "num_all": len(data_expected),
         "num_caught_exact": 0,
@@ -138,14 +142,17 @@ def measure_exact_matches(out, expected, tool_name=None, print_missed=False):
 
     # Process comparisons in parallel
     with ProcessPoolExecutor(max_workers=max(cpu_count() - 1, 1)) as executor:
-        futures = []
-        for fact_expected in data_expected:
-            futures.append(
-                executor.submit(process_fact_comparison, fact_expected, data_out)
-            )
-
-        for future in futures:
-            fact_expected = data_expected[futures.index(future)]
+        futures = {
+            executor.submit(
+                process_fact_comparison_with_index, fact_expected, index
+            ): fact_expected
+            for fact_expected in data_expected
+        }
+
+        for future in tqdm(
+            as_completed(futures), total=len(futures), desc="Matching facts"
+        ):
+            fact_expected = futures[future]  # Retrieve the corresponding fact
             try:
                 is_exact_match, is_partial_match = future.result()
                 with lock:
@@ -155,9 +162,10 @@ def measure_exact_matches(out, expected, tool_name=None, print_missed=False):
                     results["num_caught_partial"] += 1
                 elif print_missed:
                     log_missed_fact(tool_name, fact_expected)
-                progress_bar.update(1)
             except Exception as e:
                 logging.error(f"Error processing fact: {fact_expected} - {e}")
+            finally:
+                progress_bar.update(1)
 
     progress_bar.close()
     return results
@@ -183,6 +191,41 @@ def process_fact_comparison(fact_expected, data_out):
     return is_exact_match, is_partial_match
 
 
+def create_index(data_out):
+    """
+    Create an index for data_out based on (file, line_number) and optionally other fields.
+    """
+    index = defaultdict(list)
+    for fact_out in data_out:
+        key = (fact_out.get("file"), fact_out.get("line_number"))
+        index[key].append(fact_out)
+    return index
+
+
+def process_fact_comparison_with_index(fact_expected, index):
+    """
+    Compare a single fact against indexed output facts for matches.
+    """
+    is_exact_match = False
+    is_partial_match = False
+
+    # Get the relevant facts from the index
+    key = (fact_expected.get("file"), fact_expected.get("line_number"))
+    relevant_facts = index.get(key, [])
+
+    # Compare only relevant facts
+    for fact_out in relevant_facts:
+        exact_match, partial_match = check_match(fact_expected, fact_out)
+        is_exact_match = is_exact_match or exact_match
+        is_partial_match = is_partial_match or partial_match
+
+        # Break early if both matches are found
+        if is_exact_match and is_partial_match:
+            break
+
+    return is_exact_match, is_partial_match
+
+
 def log_missed_fact(tool_name, fact_expected):
     """
     Log missed facts to a CSV file for further analysis.
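
For context, a minimal standalone sketch of the lookup pattern the new helpers implement: output facts are bucketed by (file, line_number), so each expected fact is compared only against candidates at the same location instead of scanning all of data_out. The "type" field and the toy check_match below are illustrative assumptions; the real check_match is defined elsewhere in this file.

from collections import defaultdict

def check_match(fact_expected, fact_out):
    # Toy stand-in for the file's real check_match (assumption):
    # exact when the type strings are equal, partial when one contains the other.
    t_exp, t_out = fact_expected["type"], fact_out["type"]
    exact = t_exp == t_out
    partial = exact or t_exp in t_out or t_out in t_exp
    return exact, partial

data_out = [
    {"file": "a.py", "line_number": 3, "type": "builtins.int"},
    {"file": "a.py", "line_number": 7, "type": "builtins.str"},
]
fact_expected = {"file": "a.py", "line_number": 3, "type": "builtins.int"}

# Bucket output facts by location, as create_index does.
index = defaultdict(list)
for fact_out in data_out:
    index[(fact_out.get("file"), fact_out.get("line_number"))].append(fact_out)

# Only the ("a.py", 3) bucket is scanned, not the whole list.
key = (fact_expected.get("file"), fact_expected.get("line_number"))
candidates = index.get(key, [])
print([check_match(fact_expected, c) for c in candidates])  # [(True, True)]

The gain is the usual hash-map trade: one pass over data_out to build the index turns each per-fact comparison from a scan of every output fact into a lookup of a typically small bucket.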

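Separately, the dict-of-futures pattern adopted in measure_exact_matches is worth noting: keying the dict on each Future lets the loop recover the originating input in O(1) under as_completed, whereas the previous futures.index(future) cost a linear scan per result. A self-contained sketch, with square as a placeholder task standing in for process_fact_comparison_with_index:

from concurrent.futures import ProcessPoolExecutor, as_completed

def square(x):
    # Placeholder task; stands in for process_fact_comparison_with_index.
    return x * x

if __name__ == "__main__":
    inputs = [1, 2, 3, 4]
    with ProcessPoolExecutor(max_workers=2) as executor:
        # Key each Future by the input that produced it.
        futures = {executor.submit(square, x): x for x in inputs}
        # as_completed yields futures in completion order, not submission
        # order, so the dict is what pairs each result with its input.
        for future in as_completed(futures):
            print(futures[future], "->", future.result())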