1
1
import os
2
2
import json
3
3
import logging
4
- from concurrent .futures import ProcessPoolExecutor
4
+ from concurrent .futures import ProcessPoolExecutor , as_completed
5
5
from analysis_utils import format_type
6
6
from tqdm import tqdm
7
7
from multiprocessing import cpu_count
8
8
from threading import Lock
9
+ from collections import defaultdict
9
10
10
11
SCRIPT_DIR = os .path .dirname (os .path .realpath (__file__ ))
11
12
TEST_DIR = os .path .join (
@@ -122,11 +123,14 @@ def load_and_sort_json(file_path):
122
123
123
124
def measure_exact_matches (out , expected , tool_name = None , print_missed = False ):
124
125
"""
125
- Measure exact and partial matches between two JSON files.
126
+ Measure exact and partial matches between two JSON files using indexing for efficiency .
126
127
"""
127
128
data_out = load_and_sort_json (out )
128
129
data_expected = load_and_sort_json (expected )
129
130
131
+ # Create index for data_out
132
+ index = create_index (data_out )
133
+
130
134
results = {
131
135
"num_all" : len (data_expected ),
132
136
"num_caught_exact" : 0 ,
@@ -138,14 +142,17 @@ def measure_exact_matches(out, expected, tool_name=None, print_missed=False):
138
142
139
143
# Process comparisons in parallel
140
144
with ProcessPoolExecutor (max_workers = max (cpu_count () - 1 , 1 )) as executor :
141
- futures = []
142
- for fact_expected in data_expected :
143
- futures .append (
144
- executor .submit (process_fact_comparison , fact_expected , data_out )
145
- )
146
-
147
- for future in futures :
148
- fact_expected = data_expected [futures .index (future )]
145
+ futures = {
146
+ executor .submit (
147
+ process_fact_comparison_with_index , fact_expected , index
148
+ ): fact_expected
149
+ for fact_expected in data_expected
150
+ }
151
+
152
+ for future in tqdm (
153
+ as_completed (futures ), total = len (futures ), desc = "Matching facts"
154
+ ):
155
+ fact_expected = futures [future ] # Retrieve the corresponding fact
149
156
try :
150
157
is_exact_match , is_partial_match = future .result ()
151
158
with lock :
@@ -155,9 +162,10 @@ def measure_exact_matches(out, expected, tool_name=None, print_missed=False):
155
162
results ["num_caught_partial" ] += 1
156
163
elif print_missed :
157
164
log_missed_fact (tool_name , fact_expected )
158
- progress_bar .update (1 )
159
165
except Exception as e :
160
166
logging .error (f"Error processing fact: { fact_expected } - { e } " )
167
+ finally :
168
+ progress_bar .update (1 )
161
169
162
170
progress_bar .close ()
163
171
return results
@@ -183,6 +191,41 @@ def process_fact_comparison(fact_expected, data_out):
183
191
return is_exact_match , is_partial_match
184
192
185
193
194
def create_index(data_out):
    """
    Build a lookup table over *data_out* keyed by (file, line_number).

    Each output fact is grouped under the tuple of its "file" and
    "line_number" fields (missing fields become None), so callers can
    fetch every candidate fact for a given source location in O(1)
    instead of scanning the full list per comparison.
    """
    grouped = defaultdict(list)
    for entry in data_out:
        location = (entry.get("file"), entry.get("line_number"))
        grouped[location].append(entry)
    return grouped
203
+
204
+
205
def process_fact_comparison_with_index(fact_expected, index):
    """
    Check one expected fact against the location-indexed output facts.

    Looks up only the candidates sharing the expected fact's
    (file, line_number) key in *index* and runs check_match against each.
    Returns a pair of flags (is_exact_match, is_partial_match); the scan
    stops early once both kinds of match have been observed.
    """
    location = (fact_expected.get("file"), fact_expected.get("line_number"))
    candidates = index.get(location, [])

    exact_found = False
    partial_found = False
    for candidate in candidates:
        exact, partial = check_match(fact_expected, candidate)
        exact_found = exact_found or exact
        partial_found = partial_found or partial
        # Nothing more to learn once both flags are set.
        if exact_found and partial_found:
            break

    return exact_found, partial_found
227
+
228
+
186
229
def log_missed_fact (tool_name , fact_expected ):
187
230
"""
188
231
Log missed facts to a CSV file for further analysis.
0 commit comments