Skip to content

Commit 4a66e68

Browse files
committed
utils: update_fixtures_cross_sections.py
1 parent d643ff3 commit 4a66e68

File tree

1 file changed

+243
-0
lines changed

1 file changed

+243
-0
lines changed
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
#!/usr/bin/env python3
2+
3+
"""Helper script for creating the cross_section JSON field of simulated dataset record fixtures.
4+
5+
This helper script is useful for creating/updating the cross_section JSON field in
6+
the CMS 2015 simulated datasets found in the CERN Open Data record fixtures.
7+
8+
"""
9+
10+
import os
11+
import subprocess
12+
import click
13+
import json
14+
15+
16+
def _rename_cross_section_files(cross_sections_path):
    """Rename every cross-section values json file to its dataset name.

    Each file is renamed to ``$<dataset with '/' replaced by '$'>.json`` (a
    leading ``$`` is forced) so that later lookups can match records to files
    by name alone.  Returns the total number of files seen.
    """
    total = 0
    for categ in os.listdir(cross_sections_path):
        categ_dir = f"{cross_sections_path}/{categ}"
        for json_file_name in os.listdir(categ_dir):
            total += 1
            with open(f"{categ_dir}/{json_file_name}", "r") as json_file:
                json_record = json.load(json_file)
            dataset = json_record[0]["metadata"]["Dataset"]
            new_file_name = f"{dataset.replace('/', '$')}.json"
            if not new_file_name.startswith("$"):
                new_file_name = "$" + new_file_name
            os.rename(
                f"{categ_dir}/{json_file_name}",
                f"{categ_dir}/{new_file_name}",
            )
    return total


def _index_cross_section_files(cross_sections_path):
    """Map every file name under cross_sections_path to its full path.

    Built once so that record lookups are dictionary hits instead of one
    ``find`` subprocess per record (which was slow and broke on paths
    containing spaces or quotes, or when several files matched).
    """
    index = {}
    for root, _dirs, files in os.walk(cross_sections_path):
        for file_name in files:
            index[file_name] = os.path.join(root, file_name)
    return index


def _find_dataset_files(input_path):
    """Yield paths of all CMS 2015 simulated dataset files under input_path.

    Equivalent to ``find input_path -type f -name
    "cms-simulated-datasets-2015*.json"`` but without a shell, and yields
    nothing (rather than an empty-string path) when there are no matches.
    """
    for root, _dirs, files in os.walk(input_path):
        for file_name in sorted(files):
            if file_name.startswith("cms-simulated-datasets-2015") and file_name.endswith(
                ".json"
            ):
                yield os.path.join(root, file_name)


def _extract_cross_section(data):
    """Return ``(cross_section_field, format_number)`` for one data record.

    ``data`` is the second element of a cross-section values json file.  The
    presence of certain attributes identifies which of the known file formats
    it is in; see https://github.com/Ari-mu-l/OpenData/tree/main.  Format 6 is
    unlisted there but present in some json files.  Raises ``KeyError`` if a
    file lacks the keys its detected format requires (same as the original
    per-branch lookups).
    """
    # Fields common to every format; format-specific values are filled below.
    cross_section = {
        "total_value": data["totX_final"],
        "total_value_uncertainty": data["totX_final_err"],
        "matching_efficiency": "",
        "neg_weight_fraction": "",
    }
    if "totX_beforeMat" in data and "matchingEff" in data:
        format_number = 1
        cross_section["matching_efficiency"] = data["matchingEff"]
        cross_section["neg_weight_fraction"] = data["negWeightFrac"]
    elif "totX_beforeMat" in data:
        format_number = 2
    elif "totX_beforeFilter" in data and "negWeightFrac" in data:
        format_number = 3
        cross_section["neg_weight_fraction"] = data["negWeightFrac"]
    elif "filterEff(weights)" in data:
        format_number = 6
    elif "totX_beforeFilter" in data:
        format_number = 4
    else:
        format_number = 5
    # Format 6 spells the filter-efficiency key differently.
    filter_eff_key = "filterEff(weights)" if format_number == 6 else "filterEff_weights"
    cross_section["filter_efficiency"] = data[filter_eff_key]
    return cross_section, format_number


@click.command()
@click.option(
    "--cross_sections_path",
    "-c",
    required=True,
    help="Relative path to the cross-section values json files directory",
)
@click.option(
    "--input_path", "-i", required=True, help="Relative path to the input directory"
)
@click.option(
    "--output_path", "-o", required=True, help="Relative path to the output directory"
)
def main(cross_sections_path, input_path, output_path):  # noqa: D301,D412
    """Update datasets to include the cross_sections JSON field.

    Update datasets found at input_path to include the cross_sections JSON field
    and store the updated datasets at output_path.

    Example:

    \b
    $ ./utils/update_fixtures_cross_sections.py \\
        -c ../MC2015/StandardModelPhysics \\
        -i ../opendata.cern.ch/cernopendata/modules/fixtures/data/records \\
        -o ../opendata.cern.ch/cernopendata/modules/fixtures/data/records
    """
    # Rename cross-section values json files to their corresponding dataset
    # names to make the rest of the code simpler.
    total_cross_section_files = _rename_cross_section_files(cross_sections_path)

    # Index the (renamed) cross-section files once, up front.
    cross_section_index = _index_cross_section_files(cross_sections_path)

    total_datasets_amended = 0
    # One counter per known cross-section file format (1..6).
    format_counts = dict.fromkeys(range(1, 7), 0)

    # Amend target records of all target datasets.
    for target_dataset_path in _find_dataset_files(input_path):
        # Read target records.
        target_dataset_basename = os.path.basename(target_dataset_path)[: -len(".json")]
        with open(target_dataset_path, "r") as target_dataset_file:
            target_records = json.load(target_dataset_file)
        print(f"Processing {target_dataset_basename}...")

        # Add cross_section metadata field.
        new_target_records = []
        for record in target_records:
            # Find the record's corresponding cross-section values json file.
            cross_sections_file_name = record["title"].replace("/", "$") + ".json"
            cross_sections_file = cross_section_index.get(cross_sections_file_name)

            if not cross_sections_file:
                # No cross-section values for this record; keep it unchanged.
                new_target_records.append(record)
                continue

            with open(cross_sections_file, "r") as cross_sections_json_file:
                cross_sections_json_record = json.load(cross_sections_json_file)
            cross_sections_json_data = cross_sections_json_record[1]

            record["cross_section"], format_number = _extract_cross_section(
                cross_sections_json_data
            )
            format_counts[format_number] += 1
            new_target_records.append(record)
            total_datasets_amended += 1

        # Save the amended dataset.
        new_dataset_json = json.dumps(
            new_target_records,
            indent=2,
            sort_keys=True,
            ensure_ascii=False,
            separators=(",", ": "),
        )

        updated_dataset_path = f"{output_path}/{target_dataset_basename}.json"
        with open(updated_dataset_path, "w") as new_dataset_file:
            new_dataset_file.write(new_dataset_json + "\n")

        # Clean resulting JSON file with the sibling helper script, if present.
        cleaner = "../opendata.cern.ch/scripts/clean_json_file.py"
        if os.path.exists(cleaner):
            # List form (no shell) so the path cannot be shell-interpreted.
            subprocess.run([cleaner, updated_dataset_path], check=False)

    print(
        f"Total number of cross-section values json files: {total_cross_section_files},"
        f" Total number of amended datasets: {total_datasets_amended}"
    )
    for format_number in range(1, 7):
        print(
            f"Total number of datasets amended using Format {format_number}:"
            f" {format_counts[format_number]}"
        )
240+
241+
242+
# Run the CLI entry point only when executed as a script (not on import).
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)