#!/usr/bin/env python3

"""Helper script for creating the cross_section JSON field of simulated dataset record fixtures.

This helper script is useful for creating/updating the cross_section JSON field in
the CMS 2015 simulated datasets found in the CERN Open Data record fixtures.
"""

import json
import os
import subprocess

import click


@click.command()
@click.option(
    "--cross_sections_path",
    "-c",
    required=True,
    help="Relative path to the cross-section values JSON files directory",
)
@click.option(
    "--input_path", "-i", required=True, help="Relative path to the input directory"
)
@click.option(
    "--output_path", "-o", required=True, help="Relative path to the output directory"
)
def main(cross_sections_path, input_path, output_path):  # noqa: D301,D412
    """Update datasets to include the cross_section JSON field.

    Update datasets found at input_path to include the cross_section JSON field
    and store the updated datasets at output_path.

    Example:

    \b
    $ ./utils/update_fixtures_cross_sections.py \\
    -c ../MC2015/StandardModelPhysics \\
    -i ../opendata.cern.ch/cernopendata/modules/fixtures/data/records \\
    -o ../opendata.cern.ch/cernopendata/modules/fixtures/data/records
    """
    # Rename the cross-section values JSON files to their corresponding
    # dataset names to make the rest of the code simpler.
    total_cross_section_files = 0
    sub_categories = os.listdir(cross_sections_path)
    for categ in sub_categories:
        for json_file_name in os.listdir(f"{cross_sections_path}/{categ}"):
            total_cross_section_files += 1
            with open(
                f"{cross_sections_path}/{categ}/{json_file_name}", "r"
            ) as json_file:
                json_record = json.load(json_file)
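            # Each cross-section values file is a JSON array: element [0]
            # carries metadata (including the dataset name) and element [1]
            # holds the actual cross-section numbers (used further below).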
            dataset = json_record[0]["metadata"]["Dataset"]

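            # Dataset names contain slashes, so replace them with "$" to obtain
            # a valid file name, and make sure the result starts with "$"; e.g.
            # a dataset named "/Foo/Bar/MINIAODSIM" would be stored as
            # "$Foo$Bar$MINIAODSIM.json" (illustrative name).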
            new_file_name = f"{dataset.replace('/', '$')}.json"
            if new_file_name[0] != "$":
                new_file_name = "$" + new_file_name
            os.rename(
                f"{cross_sections_path}/{categ}/{json_file_name}",
                f"{cross_sections_path}/{categ}/{new_file_name}",
            )

    # Find the paths to all datasets that need to be updated.
    find_datasets_cmd = (
        f'find {input_path} -type f -name "cms-simulated-datasets-2015*.json"'
    )
    target_datasets_paths = subprocess.getoutput(find_datasets_cmd).split("\n")

    total_datasets_amended = 0
    total_format1 = 0
    total_format2 = 0
    total_format3 = 0
    total_format4 = 0
    total_format5 = 0
    total_format6 = 0

    # Amend the target records of all the target datasets.
    for target_dataset_path in target_datasets_paths:
        # Read the target records.
        target_dataset_basename = os.path.basename(target_dataset_path)[: -len(".json")]
        with open(target_dataset_path, "r") as target_dataset_file:
            target_records = json.load(target_dataset_file)
        print(f"Processing {target_dataset_basename}...")

        # Add the cross_section metadata field.
        new_target_records = []
        for record in target_records:
            # Find the record's corresponding cross-section values JSON file.
            cross_sections_file_name = record["title"].replace("/", "$")
            find_cross_sections_cmd = (
                f"find {cross_sections_path} -name '{cross_sections_file_name}.json'"
            )
            cross_sections_file = subprocess.getoutput(find_cross_sections_cmd)
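
            # find prints nothing when no file matches, so an empty string
            # means that this record has no cross-section values file (the
            # lookup assumes at most one matching file per dataset name).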
            if not cross_sections_file:
                new_target_records.append(record)
                continue

            with open(cross_sections_file, "r") as cross_sections_json_file:
                cross_sections_json_record = json.load(cross_sections_json_file)
            cross_sections_json_data = cross_sections_json_record[1]

            # Check the presence of certain attributes to identify the format
            # the file is in.
            # See: https://github.com/Ari-mu-l/OpenData/tree/main
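            # Distinguishing keys checked below:
            #   Format 1: totX_beforeMat and matchingEff
            #   Format 2: totX_beforeMat only
            #   Format 3: totX_beforeFilter and negWeightFrac
            #   Format 6: filterEff(weights)
            #   Format 4: totX_beforeFilter only
            #   Format 5: none of the above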
            # Format 1
            if (
                "totX_beforeMat" in cross_sections_json_data
                and "matchingEff" in cross_sections_json_data
            ):
                total_format1 += 1
                record["cross_section"] = {
                    "total_value": cross_sections_json_data["totX_final"],
                    "total_value_uncertainty": cross_sections_json_data[
                        "totX_final_err"
                    ],
                    "matching_efficiency": cross_sections_json_data["matchingEff"],
                    "filter_efficiency": cross_sections_json_data["filterEff_weights"],
                    "neg_weight_fraction": cross_sections_json_data["negWeightFrac"],
                }
            # Format 2
            elif "totX_beforeMat" in cross_sections_json_data:
                total_format2 += 1
                record["cross_section"] = {
                    "total_value": cross_sections_json_data["totX_final"],
                    "total_value_uncertainty": cross_sections_json_data[
                        "totX_final_err"
                    ],
                    "matching_efficiency": "",
                    "filter_efficiency": cross_sections_json_data["filterEff_weights"],
                    "neg_weight_fraction": "",
                }
            # Format 3
            elif (
                "totX_beforeFilter" in cross_sections_json_data
                and "negWeightFrac" in cross_sections_json_data
            ):
                total_format3 += 1
                record["cross_section"] = {
                    "total_value": cross_sections_json_data["totX_final"],
                    "total_value_uncertainty": cross_sections_json_data[
                        "totX_final_err"
                    ],
                    "matching_efficiency": "",
                    "filter_efficiency": cross_sections_json_data["filterEff_weights"],
                    "neg_weight_fraction": cross_sections_json_data["negWeightFrac"],
                }
            # Format 6 (unlisted format, but present in some JSON files)
            elif "filterEff(weights)" in cross_sections_json_data:
                total_format6 += 1
                record["cross_section"] = {
                    "total_value": cross_sections_json_data["totX_final"],
                    "total_value_uncertainty": cross_sections_json_data[
                        "totX_final_err"
                    ],
                    "matching_efficiency": "",
                    "filter_efficiency": cross_sections_json_data[
                        "filterEff(weights)"
                    ],
                    "neg_weight_fraction": "",
                }
            # Format 4
            elif "totX_beforeFilter" in cross_sections_json_data:
                total_format4 += 1
                record["cross_section"] = {
                    "total_value": cross_sections_json_data["totX_final"],
                    "total_value_uncertainty": cross_sections_json_data[
                        "totX_final_err"
                    ],
                    "matching_efficiency": "",
                    "filter_efficiency": cross_sections_json_data["filterEff_weights"],
                    "neg_weight_fraction": "",
                }
            # Format 5
            else:
                total_format5 += 1
                record["cross_section"] = {
                    "total_value": cross_sections_json_data["totX_final"],
                    "total_value_uncertainty": cross_sections_json_data[
                        "totX_final_err"
                    ],
                    "matching_efficiency": "",
                    "filter_efficiency": cross_sections_json_data["filterEff_weights"],
                    "neg_weight_fraction": "",
                }

            new_target_records.append(record)
            total_datasets_amended += 1

        # Save the amended dataset.
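        # Sorted keys and a two-space indent are used to keep the output
        # consistent with the formatting of the existing fixture files
        # (assumed convention).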
        new_dataset_json = json.dumps(
            new_target_records,
            indent=2,
            sort_keys=True,
            ensure_ascii=False,
            separators=(",", ": "),
        )

        updated_dataset_path = f"{output_path}/{target_dataset_basename}.json"
        with open(updated_dataset_path, "w") as new_dataset_file:
            new_dataset_file.write(new_dataset_json + "\n")

        # Clean the resulting JSON file.
        if os.path.exists("../opendata.cern.ch/scripts/clean_json_file.py"):
            os.system(
                f"../opendata.cern.ch/scripts/clean_json_file.py {updated_dataset_path}"
            )

    print(
        f"Total number of cross-section values json files: {total_cross_section_files}, "
        f"Total number of amended datasets: {total_datasets_amended}"
    )
    print(f"Total number of datasets amended using Format 1: {total_format1}")
    print(f"Total number of datasets amended using Format 2: {total_format2}")
    print(f"Total number of datasets amended using Format 3: {total_format3}")
    print(f"Total number of datasets amended using Format 4: {total_format4}")
    print(f"Total number of datasets amended using Format 5: {total_format5}")
    print(f"Total number of datasets amended using Format 6: {total_format6}")


if __name__ == "__main__":
    main()