Skip to content

Commit b067d71

Browse files
committed
add support for mcdb and cleanup code
1 parent 590401f commit b067d71

File tree

1 file changed

+102
-8
lines changed

1 file changed

+102
-8
lines changed

cms-2016-simulated-datasets/code/lhe_generators.py

+102-8
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020

2121
def log(recid, logtype, logmessage):
2222
"""Store a log message of a certain type to record-ID-based log file system."""
23-
logdir = f"./lhe_generators/2016-sim/gridpacks/{recid}"
23+
logdir = f'./lhe_generators/2016-sim/gridpacks/{recid}'
24+
2425
if not os.path.exists(logdir):
2526
os.makedirs(logdir)
2627
with open(f"{logdir}/LOG.txt", "a") as fdesc:
@@ -44,25 +45,116 @@ def get_lhe(dataset, mcm_dir):
4445

4546

4647
def cmd_run(cmds, recid):
    """Run each shell command in *cmds* sequentially, stopping on failure.

    The commands rely on shell features (redirection, globbing), hence
    ``shell=True`` with a command string.

    :param cmds: iterable of shell command strings to execute in order.
    :param recid: record ID used to route error messages to the
        record-ID-based log (via ``log``).
    :return: ``True`` if every command completed cleanly; ``False`` as soon
        as one command fails (remaining commands are not run).
    """
    for cmd in cmds:
        proc = subprocess.run(
            cmd,
            shell=True,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        err = proc.stderr.decode()
        # Any stderr output counts as a failure (original behavior).
        # Additionally check the exit status: previously a command that
        # exited non-zero without writing to stderr was treated as success.
        if err or proc.returncode != 0:
            log(recid, "ERROR", f"Error {err or f'exit code {proc.returncode}'}")
            return False

    return True
5662

63+
5764
def create_lhe_generator(
5865
dataset, recid, mcm_dir, gen_store="./lhe_generators/2016-sim"
5966
):
60-
# mcm_dir is the directory of the LHE step
67+
'''
68+
mcm_dir is the directory of the LHE step
69+
'''
6170
mcdb_id = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "mcdb_id") or 0
62-
if mcdb_id > 0:
63-
log(recid, "WARNING", f"Skipping because of mcdb_id value {mcdb_id}")
64-
return
71+
72+
if mcdb_id > 1:
73+
74+
parent_dir = f'{gen_store}/mcdb'
75+
76+
'''
77+
Make dir if it doesn't already exist
78+
'''
79+
if not os.path.isdir(parent_dir):
80+
os.makedirs(parent_dir)
81+
82+
filepath = f'{gen_store}/mcdb/{mcdb_id}'
83+
84+
'''
85+
If the header file already exists and is not empty
86+
we are already done so return.
87+
'''
88+
if os.path.exists(filepath + "_header.txt") and get_file_size(f'{filepath}_header.txt') > 1024:
89+
print(
90+
f'{mcdb_id} mcdb id exists, skipping'
91+
)
92+
return
93+
94+
#We only want .xz or .lhe extensions.
95+
files = [
96+
f for f in os.listdir(f'/eos/cms/store/lhe/{mcdb_id}') if os.path.isfile(os.path.join(f'/eos/cms/store/lhe/{mcdb_id}', f)) and
97+
(os.path.splitext(f)[1] == '.xz' or os.path.splitext(f)[1] == '.lhe')
98+
]
99+
100+
101+
#If we have no files then return.
102+
if len(files) == 0:
103+
print(
104+
f'<exterr>\n {mcdb_id}: no files found with .xz or .lhe extension\n </exterr>',
105+
file=sys.stderr
106+
)
107+
return
108+
109+
#If there are multiple files with either .xz or .lhe
110+
#extensions then take the first one anyway.
65111

112+
#TODO: If multiple files then use _0? Is it certain that it exists?
113+
#See https://github.com/cernopendata/data-curation/issues/97
114+
(_, ext) = os.path.splitext(files[0])
115+
116+
generators = get_from_deep_json(
117+
get_mcm_dict(dataset, mcm_dir), "generators") or 0
118+
119+
cmds = [
120+
f"xz -d -c /eos/cms/store/lhe/{mcdb_id}/* > {filepath} " if ext == '.xz'
121+
else f"cp /eos/cms/store/lhe/{mcdb_id}/* {filepath} ",
122+
f"awk '/<!--/,/-->/' {filepath} > {filepath}_header.txt" if generators == ["MCFM701"]
123+
else f"awk '/<header>/,/<\/header>/' {filepath} > {filepath}_header.txt"
124+
]
125+
126+
if cmd_run(cmds, dataset):
127+
size = get_file_size(f'{filepath}_header.txt')
128+
129+
if size <= 1024:
130+
131+
#If empty, take comments (assume it is a MCFM701)
132+
cmd_run(
133+
[
134+
f"awk '/<!--/,/-->/' {filepath} > {filepath}_header.txt; \
135+
awk '/<init>/,/<\/init>/' {filepath} >> {filepath}_header.txt;"
136+
],
137+
dataset
138+
)
139+
140+
size = get_file_size(
141+
f'{filepath}_header.txt'
142+
)
143+
144+
if size <= 1024:
145+
print(
146+
f'<size>\n[Warning] in {dataset}\n mcdb_id: {mcdb_id} \n==>\t Header file size is only {size} Bytes\n</size>',
147+
file=sys.stderr
148+
)
149+
150+
#If we got the _header.txt file then, there is no need to keep original files
151+
cmd_run(
152+
[
153+
f'rm -rf {filepath}'
154+
],
155+
dataset
156+
)
157+
66158
# Find fragment
67159
fragment_url = get_genfragment_url(dataset, mcm_dir)
68160
if fragment_url:
@@ -227,12 +319,14 @@ def create_lhe_generator(
227319
dataset_nanoaod = [
228320
name[:-1] for name in dataset_full_names if name[:-1].endswith("NANOAODSIM")
229321
]
322+
230323
i = 1
231324
l = len(dataset_nanoaod)
325+
232326
for dataset in dataset_nanoaod:
233327
recid = RECID_INFO[dataset]
234328

235-
print(f"Getting LHE {i}/{l}")
329+
#print(f"Getting LHE {i}/{l}")
236330
log(recid, "INFO", f"Getting LHE {i}/{l}")
237331
log(recid, "INFO", f"Found record ID {recid}")
238332
log(recid, "INFO", f"Found dataset {dataset}")

0 commit comments

Comments (0)