Skip to content

Commit 38af7bf

Browse files
committed
WIP
1 parent 0ef6455 commit 38af7bf

File tree

6 files changed

+86435
-3256
lines changed

6 files changed

+86435
-3256
lines changed

cms-2016-simulated-datasets/README.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ Warning: Creating the full local cache might take a long time.
4343
The first step is to create the EOS file index cache:
4444

4545
```console
46-
$ time python3 ./code/interface.py --create-eos-indexes inputs/CMS-2016-mc-datasets.txt
46+
$ python3 ./code/interface.py --create-eos-indexes inputs/CMS-2016-mc-datasets.txt
4747
```
4848

4949
This requires the data files to be placed in their final location. However, for
@@ -53,17 +53,17 @@ by means of adding the command-line option `--ignore-eos-store` to the commands
5353
We can now build sample records by doing:
5454

5555
```console
56-
$ time python3 ./code/interface.py --create-das-json-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
56+
$ python3 ./code/interface.py --create-das-json-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
5757

5858
$ auth-get-sso-cookie -u https://cms-pdmv.cern.ch/mcm -o cookies.txt
59-
$ time python3 ./code/interface.py --create-mcm-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
59+
$ python3 ./code/interface.py --create-mcm-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
6060

61-
$ time python3 ./code/interface.py --get-conf-files --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
61+
$ python3 ./code/interface.py --get-conf-files --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
6262

63-
$ time python3 code/lhe_generators.py
63+
$ python3 code/lhe_generators.py
6464

65-
$ time python3 ./code/interface.py --create-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
66-
$ time python3 ./code/interface.py --create-conffiles-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
65+
$ python3 ./code/interface.py --create-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
66+
$ python3 ./code/interface.py --create-conffiles-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
6767
```
6868

6969
Note that to build the test records an (empty) input file for DOIs and a recid
@@ -80,7 +80,7 @@ The output JSON files for the dataset records will be generated in the
8080

8181

8282
```console
83-
python3 code/lhe_generators.py 2> errors > output &
83+
$ python3 code/lhe_generators.py >& output
8484
```
8585

8686
- This will get lhe generator parameters from gridpacks for datasets listed in `./inputs/CMS-2016-mc-datasets.txt`.

cms-2016-simulated-datasets/code/config_store.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,14 @@ def get_conffile_ids_all_chain_steps(dataset, mcm_dir):
1111
"""Return location of the configuration files for the dataset."""
1212
ids = {}
1313
path = mcm_dir + '/chain/' + dataset.replace('/', '@')
14-
step_dirs = os.listdir(path)
14+
try:
15+
step_dirs = os.listdir(path)
16+
except FileNotFoundError:
17+
return []
1518
for step in step_dirs:
1619
step_dir = path + '/' + step
1720
mcm_config_ids = get_conffile_ids_from_mcm(dataset, step_dir)
18-
21+
1922
for someid in mcm_config_ids:
2023
ids[someid] = 1
2124

cms-2016-simulated-datasets/code/lhe_generators.py

Lines changed: 205 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,126 +1,247 @@
1+
#!/usr/bin/env python3
2+
3+
import datetime
4+
import fnmatch
5+
import os
6+
import re
7+
import requests
8+
import subprocess
9+
import urllib3
10+
111
from dataset_records import *
2-
from os import listdir
3-
from os.path import isfile, join
4-
from requests.packages.urllib3.exceptions import InsecureRequestWarning
12+
from mcm_store import get_mcm_dict
13+
from utils import get_from_deep_json
14+
15+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
16+
17+
RECID_INFO = {}
18+
exec(open("inputs/recid_info.py", "r").read()) # import RECID_INFO
519

620

7-
exec(open('inputs/recid_info.py', 'r').read()) # import RECID_INFO
8-
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
21+
def log(recid, logtype, logmessage):
22+
"""Store a log message of a certain type to record-ID-based log file system."""
23+
logdir = f"./lhe_generators/2016-sim/gridpacks/{recid}"
24+
if not os.path.exists(logdir):
25+
os.makedirs(logdir)
26+
with open(f"{logdir}/LOG.txt", "a") as fdesc:
27+
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
28+
fdesc.write(f"{now} | {logtype} | {logmessage}\n")
29+
930

10-
# get LHE Parent or False
1131
def get_lhe(dataset, mcm_dir):
12-
path = mcm_dir + '/chain/' + dataset.replace('/', '@')
32+
"""Get LHE Parent or False"""
33+
path = mcm_dir + "/chain/" + dataset.replace("/", "@")
1334
step_dirs = os.listdir(path)
1435
for step in step_dirs:
15-
step_dir = path + '/' + step
16-
datatier = get_from_deep_json(get_mcm_dict(dataset,step_dir),'datatier')
17-
if "LHE" in datatier:
36+
step_dir = path + "/" + step
37+
datatier = get_from_deep_json(get_mcm_dict(dataset, step_dir), "datatier")
38+
if datatier and "LHE" in datatier:
1839
return step_dir
19-
2040
return False
2141

2242

23-
def cmd_run(cmds, dataset):
43+
def cmd_run(cmds, recid):
2444
for cmd in cmds:
25-
err = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE,
26-
stdout=subprocess.PIPE).stderr.decode()
45+
err = subprocess.run(
46+
cmd, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE
47+
).stderr.decode()
2748
if err:
28-
print("<pserr>\n[Error] in " + dataset + "\n==>\t" +
29-
err + "<==\n</pserr>", file=sys.stderr)
49+
log(recid, "ERROR", f"Error {err}")
3050
return False
3151
return True
3252

3353

34-
def create_lhe_generator(dataset, recid, mcm_dir, gen_store='./lhe_generators/2016-sim'):
35-
# mcm_dir is the directory of the LHE step
36-
fragment_url = get_genfragment_url(dataset, mcm_dir)
37-
if fragment_url:
38-
fragment_url = fragment_url[0]
39-
fragment = requests.get(fragment_url, verify=False).text
40-
if not fragment:
41-
fragment = get_from_deep_json(
42-
get_mcm_dict(dataset, mcm_dir), "fragment")
43-
else:
44-
fragment = get_from_deep_json(
45-
get_mcm_dict(dataset, mcm_dir), "fragment")
54+
def create_lhe_generator(
55+
dataset, recid, mcm_dir, gen_store="./lhe_generators/2016-sim"
56+
):
57+
# mcm_dir is the directory of the LHE step
58+
mcdb_id = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "mcdb_id") or 0
59+
if mcdb_id > 0:
60+
log(recid, "WARNING", f"Skipping because of mcdb_id value {mcdb_id}")
61+
return
62+
63+
# Find fragment
64+
fragment_url = get_genfragment_url(dataset, mcm_dir)
65+
if fragment_url:
66+
fragment_url = fragment_url[0]
67+
fragment = requests.get(fragment_url, verify=False).text
4668
if not fragment:
47-
print("<emp>\n[Error] in" + dataset +
48-
"\n==>\t No fragment URL and Empty fragment in mcm dict, Skipping\n</emp>", file=sys.stderr)
49-
return
50-
51-
path = re.search(r"cms.vstring\('(.*?)'", fragment)
52-
53-
if not path:
54-
print("<vstring>\n[Warning] in" + dataset +
55-
"\n==>\t 'cms.vstring' not found in fragment , Skipping\n</vstring>", file=sys.stderr)
56-
return
57-
path = path.group(1)
58-
# print("found path: " + str(path) )
59-
outfilepath = "{gen_store}/gridpacks/{recid}".format(
60-
gen_store=gen_store, recid=recid)
61-
62-
if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) != 0:
63-
print(str(recid) + ' recid gridpack Exist, Skipping')
64-
return
65-
66-
if 'amcatnlo' in path or 'amcatnlo' in dataset:
67-
print(dataset + '\n' + str(recid) +
68-
"amcatnlo gridpack!!! path:" + path)
69-
files = [
70-
'process/Cards/run_card.dat',
71-
'process/Cards/proc_card*.dat',
72-
'process/Cards/param_card.dat',
69+
fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
70+
else:
71+
fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
72+
if not fragment:
73+
log(
74+
recid,
75+
"ERROR",
76+
f"No fragment URL and Empty fragment in mcm dict; skipping.",
77+
)
78+
return
79+
80+
# Find gridpack path
81+
path = re.search(r"cms.vstring\(['\"](/cvmfs.*?)['\"]", fragment)
82+
if not path:
83+
log(
84+
recid,
85+
"ERROR",
86+
f"No 'cms.vstring(['\"]/cvmfs' found in fragment; skipping.",
87+
)
88+
return
89+
90+
path = path.group(1)
91+
log(recid, "INFO", f"Found path {path}")
92+
outfilepath = "{gen_store}/gridpacks/{recid}".format(
93+
gen_store=gen_store, recid=recid
94+
)
95+
if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) > 1:
96+
log(
97+
recid,
98+
"WARNING",
99+
f"Gridpack seems to exist for this record ID already. Skipping.",
100+
)
101+
return
102+
103+
# Identify gridpack case
104+
gridpack_case = "UNKNOWN"
105+
path_lower = path.lower()
106+
path_lower_position = {}
107+
for acase in ["amcatnlo", "madgraph", "powheg", "jhugen", "phantom", "mcfm"]:
108+
path_lower_position[acase] = path_lower.find(acase)
109+
found = 1e10
110+
for key, val in path_lower_position.items():
111+
if val > 0 and val < found:
112+
gridpack_case = key
113+
if gridpack_case == "UNKNOWN":
114+
log(recid, "ERROR", f"Found case {gridpack_case}")
115+
else:
116+
log(recid, "INFO", f"Found case {gridpack_case}")
117+
118+
# List content of all files in gridpack tarball
119+
files_all = []
120+
res = subprocess.check_output(f"tar tf {path}", shell=True)
121+
for line in res.splitlines():
122+
files_all.append(line.decode())
123+
124+
# Select interesting files based on gridpack case
125+
files = [
126+
"./InputCards/*.dat",
127+
"./runcmsgrid.sh",
128+
"InputCards/*.dat",
129+
"runcmsgrid.sh",
130+
]
131+
if gridpack_case == "amcatnlo":
132+
files.extend(
133+
[
134+
"./process/Cards/param_card.dat",
135+
"./process/Cards/proc_card*.dat",
136+
"./process/Cards/run_card.dat",
137+
"process/Cards/param_card.dat",
138+
"process/Cards/proc_card*.dat",
139+
"process/Cards/run_card.dat",
73140
]
74-
mv_cmd = "mv process/Cards/*dat .; rmdir -p process/Cards"
75-
elif 'madgraph' in path:
76-
files = [
77-
'process/madevent/Cards/run_card.dat',
78-
'process/madevent/Cards/proc_card*.dat',
79-
'process/madevent/Cards/param_card.dat',
141+
)
142+
elif gridpack_case == "madgraph":
143+
files.extend(
144+
[
145+
"./process/madevent/Cards/param_card.dat",
146+
"./process/madevent/Cards/proc_card*.dat",
147+
"./process/madevent/Cards/run_card.dat",
148+
"process/madevent/Cards/param_card.dat",
149+
"process/madevent/Cards/proc_card*.dat",
150+
"process/madevent/Cards/run_card.dat",
80151
]
81-
mv_cmd = "mv process/madevent/Cards/*dat .; rmdir -p process/madevent/Cards"
82-
elif 'powheg' in path:
83-
files = [
84-
'*.input',
152+
)
153+
elif gridpack_case == "powheg":
154+
files.extend(
155+
[
156+
"*.input",
85157
]
86-
mv_cmd = ""
87-
else:
88-
print("<path>\n[Error] Unknown path:('" + path +
89-
"')\nDataset: " + dataset + '\n</path>', file=sys.stderr)
90-
return
91-
92-
files = "'" + "' '".join(files) + "'"
158+
)
159+
elif gridpack_case == "jhugen":
160+
files.extend(
161+
[
162+
"./jhugen.input",
163+
"./jhugen_decay.input",
164+
"jhugen.input",
165+
"jhugen_decay.input",
166+
]
167+
)
168+
elif gridpack_case == "phantom":
169+
files.extend(
170+
[
171+
"./r_GEN.in",
172+
"r_GEN.in",
173+
]
174+
)
175+
elif gridpack_case == "mcfm":
176+
files.extend(
177+
[
178+
"./readInput.DAT",
179+
"readInput.DAT",
180+
]
181+
)
182+
183+
# Select only those files that are present
184+
files_selected = []
185+
for afile in files:
186+
files_selected.extend(fnmatch.filter(files_all, afile))
187+
188+
# Warn if there was no runcmsgrid or InputCards found for some cases
189+
if gridpack_case in ("amcatnlo", "madgraph"):
190+
if not "InputCards" in " ".join(files_selected):
191+
log(recid, "ERROR", f"InputCards not present in the tarball.")
192+
if not "runcmsgrid.sh" in " ".join(files_selected):
193+
log(recid, "ERROR", f"runcmsgrid.sh not present in the tarball.")
194+
195+
# Warn if no interesting files were found at all
196+
if len(files_selected) == 0:
197+
log(recid, "ERROR", "Found no interesting files at all.")
198+
else:
199+
# Inform about which files are going to be extracted
200+
log(
201+
recid,
202+
"INFO",
203+
f"Found the following interesting files: {' '.join(files_selected)}",
204+
)
205+
# Prepare the tarball extraction command
93206
cmds = [
94-
"mkdir -p {out}; cd {out};\
95-
tar -xf {path} {files} -C {out}; {mv}".format(out=outfilepath, path=path, files=files, mv=mv_cmd)
207+
f"mkdir -p {outfilepath}; cd {outfilepath}; tar -xf {path} {' '.join(files_selected)} -C {outfilepath}"
96208
]
97-
# print("Prepared commands: " + str(cmds))
98-
cmd_run(cmds, dataset)
209+
log(recid, "INFO", f"Executing commands {cmds}")
210+
# Run the tarball extraction command
211+
cmd_run(cmds, recid)
212+
213+
# Print full content of gridpack tarball for debugging purposes
214+
log(recid, "DEBUG", f"Full gridpack tarball content is:")
215+
for afile in files_all:
216+
log(recid, "DEBUG", f"- {afile}")
99217

100218

101219
das_dir = "./inputs/das-json-store"
102220
mcm_dir = "./inputs/mcm-store"
103-
with open("./inputs/CMS-2016-mc-datasets.txt", 'r') as file:
221+
with open("./inputs/CMS-2016-mc-datasets.txt", "r") as file:
104222
dataset_full_names = file.readlines()
105223

106-
dataset_nanoaod = [name[:-1] for name in dataset_full_names if name[:-1].endswith('NANOAODSIM')]
224+
dataset_nanoaod = [
225+
name[:-1] for name in dataset_full_names if name[:-1].endswith("NANOAODSIM")
226+
]
107227
i = 1
108228
l = len(dataset_nanoaod)
109229
for dataset in dataset_nanoaod:
230+
recid = RECID_INFO[dataset]
110231

111-
#dataset = dataset[:-1]
232+
print(f"Getting LHE {i}/{l}")
233+
log(recid, "INFO", f"Getting LHE {i}/{l}")
234+
log(recid, "INFO", f"Found record ID {recid}")
235+
log(recid, "INFO", f"Found dataset {dataset}")
112236

113237
lhe_dir = get_lhe(dataset, mcm_dir)
114238
if not lhe_dir:
239+
log(recid, "WARNING", f"There is no LHE directory. Skipping.")
115240
continue
116241

117-
recid = RECID_INFO[dataset]
118-
119-
print("Getting ({i}/{l}): {ds}".format(
120-
i=i, l=l, ds=lhe_dir or 'No LHE parent for this record'))
242+
log(recid, "INFO", f"Found LHE directory {lhe_dir}")
121243

122-
t = threading.Thread(target=create_lhe_generator,
123-
args=(dataset, recid, lhe_dir))
244+
t = threading.Thread(target=create_lhe_generator, args=(dataset, recid, lhe_dir))
124245
t.start()
125246
i += 1
126247
while threading.activeCount() >= 20:

0 commit comments

Comments
 (0)