20
20
21
21
def log (recid , logtype , logmessage ):
22
22
"""Store a log message of a certain type to record-ID-based log file system."""
23
- logdir = f"./lhe_generators/2016-sim/gridpacks/{ recid } "
23
+ logdir = f'./lhe_generators/2016-sim/gridpacks/{ recid } '
24
+
24
25
if not os .path .exists (logdir ):
25
26
os .makedirs (logdir )
26
27
with open (f"{ logdir } /LOG.txt" , "a" ) as fdesc :
@@ -44,25 +45,116 @@ def get_lhe(dataset, mcm_dir):
44
45
45
46
46
47
def cmd_run(cmds, recid):
    """Run each shell command in *cmds* sequentially.

    Stops at the first failing command and logs its stderr output
    against record ID *recid* via the module's ``log`` helper.

    Returns True if every command exited successfully, False otherwise.
    """
    for cmd in cmds:
        # shell=True is required here: the callers build commands that use
        # globs and output redirection (e.g. "xz -d -c .../* > file").
        completed = subprocess.run(
            cmd,
            shell=True,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )

        # Judge success by the exit status, not by the mere presence of
        # stderr output: many tools write warnings to stderr while still
        # succeeding, and a failing command may emit nothing at all.
        if completed.returncode != 0:
            err = completed.stderr.decode()
            log(recid, "ERROR", f"Error {err}")
            return False

    return True
56
62
63
+
57
64
def create_lhe_generator (
58
65
dataset , recid , mcm_dir , gen_store = "./lhe_generators/2016-sim"
59
66
):
60
- # mcm_dir is the directory of the LHE step
67
+ '''
68
+ mcm_dir is the directory of the LHE step
69
+ '''
61
70
mcdb_id = get_from_deep_json (get_mcm_dict (dataset , mcm_dir ), "mcdb_id" ) or 0
62
- if mcdb_id > 0 :
63
- log (recid , "WARNING" , f"Skipping because of mcdb_id value { mcdb_id } " )
64
- return
71
+
72
+ if mcdb_id > 1 :
73
+
74
+ parent_dir = f'{ gen_store } /mcdb'
75
+
76
+ '''
77
+ Make dir if it doesn't already exist
78
+ '''
79
+ if not os .path .isdir (parent_dir ):
80
+ os .makedirs (parent_dir )
81
+
82
+ filepath = f'{ gen_store } /mcdb/{ mcdb_id } '
83
+
84
+ '''
85
+ If the header file already exists and is not empty
86
+ we are already done so return.
87
+ '''
88
+ if os .path .exists (filepath + "_header.txt" ) and get_file_size (f'{ filepath } _header.txt' ) > 1024 :
89
+ print (
90
+ f'{ mcdb_id } mcdb id exists, skipping'
91
+ )
92
+ return
93
+
94
+ #We only want .xz or .lhe extensions.
95
+ files = [
96
+ f for f in os .listdir (f'/eos/cms/store/lhe/{ mcdb_id } ' ) if os .path .isfile (os .path .join (f'/eos/cms/store/lhe/{ mcdb_id } ' , f )) and
97
+ (os .path .splitext (f )[1 ] == '.xz' or os .path .splitext (f )[1 ] == '.lhe' )
98
+ ]
99
+
100
+
101
+ #If we have no files then return.
102
+ if len (files ) == 0 :
103
+ print (
104
+ f'<exterr>\n { mcdb_id } : no files found with .xz or .lhe extension\n </exterr>' ,
105
+ file = sys .stderr
106
+ )
107
+ return
108
+
109
+ #If there are multiple files with either .xz or .lhe
110
+ #extensions then take the first one anyway.
65
111
112
+ #TODO: If multiple files then use _0? Is it certain that it exists?
113
+ #See https://github.com/cernopendata/data-curation/issues/97
114
+ (_ , ext ) = os .path .splitext (files [0 ])
115
+
116
+ generators = get_from_deep_json (
117
+ get_mcm_dict (dataset , mcm_dir ), "generators" ) or 0
118
+
119
+ cmds = [
120
+ f"xz -d -c /eos/cms/store/lhe/{ mcdb_id } /* > { filepath } " if ext == '.xz'
121
+ else f"cp /eos/cms/store/lhe/{ mcdb_id } /* { filepath } " ,
122
+ f"awk '/<!--/,/-->/' { filepath } > { filepath } _header.txt" if generators == ["MCFM701" ]
123
+ else f"awk '/<header>/,/<\/header>/' { filepath } > { filepath } _header.txt"
124
+ ]
125
+
126
+ if cmd_run (cmds , dataset ):
127
+ size = get_file_size (f'{ filepath } _header.txt' )
128
+
129
+ if size <= 1024 :
130
+
131
+ #If empty, take comments (assume it is a MCFM701)
132
+ cmd_run (
133
+ [
134
+ f"awk '/<!--/,/-->/' { filepath } > { filepath } _header.txt; \
135
+ awk '/<init>/,/<\/init>/' { filepath } >> { filepath } _header.txt;"
136
+ ],
137
+ dataset
138
+ )
139
+
140
+ size = get_file_size (
141
+ f'{ filepath } _header.txt'
142
+ )
143
+
144
+ if size <= 1024 :
145
+ print (
146
+ f'<size>\n [Warning] in { dataset } \n mcdb_id: { mcdb_id } \n ==>\t Header file size is only { size } Bytes\n </size>' ,
147
+ file = sys .stderr
148
+ )
149
+
150
+ #If we got the _header.txt file then, there is no need to keep original files
151
+ cmd_run (
152
+ [
153
+ f'rm -rf { filepath } '
154
+ ],
155
+ dataset
156
+ )
157
+
66
158
# Find fragment
67
159
fragment_url = get_genfragment_url (dataset , mcm_dir )
68
160
if fragment_url :
@@ -227,12 +319,14 @@ def create_lhe_generator(
227
319
dataset_nanoaod = [
228
320
name [:- 1 ] for name in dataset_full_names if name [:- 1 ].endswith ("NANOAODSIM" )
229
321
]
322
+
230
323
i = 1
231
324
l = len (dataset_nanoaod )
325
+
232
326
for dataset in dataset_nanoaod :
233
327
recid = RECID_INFO [dataset ]
234
328
235
- print (f"Getting LHE { i } /{ l } " )
329
+ # print(f"Getting LHE {i}/{l}")
236
330
log (recid , "INFO" , f"Getting LHE { i } /{ l } " )
237
331
log (recid , "INFO" , f"Found record ID { recid } " )
238
332
log (recid , "INFO" , f"Found dataset { dataset } " )
0 commit comments