Skip to content

Commit b067d71

Browse files
committed
add support for mcdb and cleanup code
1 parent 590401f commit b067d71

File tree

1 file changed

+102
-8
lines changed

1 file changed

+102
-8
lines changed

cms-2016-simulated-datasets/code/lhe_generators.py

+102-8
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020

2121
def log(recid, logtype, logmessage):
2222
"""Store a log message of a certain type to record-ID-based log file system."""
23-
logdir = f"./lhe_generators/2016-sim/gridpacks/{recid}"
23+
logdir = f'./lhe_generators/2016-sim/gridpacks/{recid}'
24+
2425
if not os.path.exists(logdir):
2526
os.makedirs(logdir)
2627
with open(f"{logdir}/LOG.txt", "a") as fdesc:
@@ -44,25 +45,116 @@ def get_lhe(dataset, mcm_dir):
4445

4546

4647
def cmd_run(cmds, recid):
    """Run each shell command in *cmds* sequentially, stopping on failure.

    The commands rely on shell features (redirection, globbing), hence
    ``shell=True`` with a command string.

    :param cmds: iterable of shell command strings to execute in order.
    :param recid: record ID used to route error messages to the
        record-ID-based log (via ``log``).
    :return: ``True`` if every command completed cleanly; ``False`` as soon
        as one command fails (remaining commands are not run).
    """
    for cmd in cmds:
        proc = subprocess.run(
            cmd,
            shell=True,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        err = proc.stderr.decode()
        # Any stderr output counts as a failure (original behavior).
        # Additionally check the exit status: previously a command that
        # exited non-zero without writing to stderr was treated as success.
        if err or proc.returncode != 0:
            log(recid, "ERROR", f"Error {err or f'exit code {proc.returncode}'}")
            return False

    return True
5662

63+
5764
def create_lhe_generator(
5865
dataset, recid, mcm_dir, gen_store="./lhe_generators/2016-sim"
5966
):
60-
# mcm_dir is the directory of the LHE step
67+
'''
68+
mcm_dir is the directory of the LHE step
69+
'''
6170
mcdb_id = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "mcdb_id") or 0
62-
if mcdb_id > 0:
63-
log(recid, "WARNING", f"Skipping because of mcdb_id value {mcdb_id}")
64-
return
71+
72+
if mcdb_id > 1:
73+
74+
parent_dir = f'{gen_store}/mcdb'
75+
76+
'''
77+
Make dir if it doesn't already exist
78+
'''
79+
if not os.path.isdir(parent_dir):
80+
os.makedirs(parent_dir)
81+
82+
filepath = f'{gen_store}/mcdb/{mcdb_id}'
83+
84+
'''
85+
If the header file already exists and is not empty
86+
we are already done so return.
87+
'''
88+
if os.path.exists(filepath + "_header.txt") and get_file_size(f'{filepath}_header.txt') > 1024:
89+
print(
90+
f'{mcdb_id} mcdb id exists, skipping'
91+
)
92+
return
93+
94+
#We only want .xz or .lhe extensions.
95+
files = [
96+
f for f in os.listdir(f'/eos/cms/store/lhe/{mcdb_id}') if os.path.isfile(os.path.join(f'/eos/cms/store/lhe/{mcdb_id}', f)) and
97+
(os.path.splitext(f)[1] == '.xz' or os.path.splitext(f)[1] == '.lhe')
98+
]
99+
100+
101+
#If we have no files then return.
102+
if len(files) == 0:
103+
print(
104+
f'<exterr>\n {mcdb_id}: no files found with .xz or .lhe extension\n </exterr>',
105+
file=sys.stderr
106+
)
107+
return
108+
109+
#If there are multiple files with either .xz or .lhe
110+
#extensions then take the first one anyway.
65111

112+
#TODO: If multiple files then use _0? Is it certain that it exists?
113+
#See https://github.com/cernopendata/data-curation/issues/97
114+
(_, ext) = os.path.splitext(files[0])
115+
116+
generators = get_from_deep_json(
117+
get_mcm_dict(dataset, mcm_dir), "generators") or 0
118+
119+
cmds = [
120+
f"xz -d -c /eos/cms/store/lhe/{mcdb_id}/* > {filepath} " if ext == '.xz'
121+
else f"cp /eos/cms/store/lhe/{mcdb_id}/* {filepath} ",
122+
f"awk '/<!--/,/-->/' {filepath} > {filepath}_header.txt" if generators == ["MCFM701"]
123+
else f"awk '/<header>/,/<\/header>/' {filepath} > {filepath}_header.txt"
124+
]
125+
126+
if cmd_run(cmds, dataset):
127+
size = get_file_size(f'{filepath}_header.txt')
128+
129+
if size <= 1024:
130+
131+
#If empty, take comments (assume it is a MCFM701)
132+
cmd_run(
133+
[
134+
f"awk '/<!--/,/-->/' {filepath} > {filepath}_header.txt; \
135+
awk '/<init>/,/<\/init>/' {filepath} >> {filepath}_header.txt;"
136+
],
137+
dataset
138+
)
139+
140+
size = get_file_size(
141+
f'{filepath}_header.txt'
142+
)
143+
144+
if size <= 1024:
145+
print(
146+
f'<size>\n[Warning] in {dataset}\n mcdb_id: {mcdb_id} \n==>\t Header file size is only {size} Bytes\n</size>',
147+
file=sys.stderr
148+
)
149+
150+
#If we got the _header.txt file then, there is no need to keep original files
151+
cmd_run(
152+
[
153+
f'rm -rf {filepath}'
154+
],
155+
dataset
156+
)
157+
66158
# Find fragment
67159
fragment_url = get_genfragment_url(dataset, mcm_dir)
68160
if fragment_url:
@@ -227,12 +319,14 @@ def create_lhe_generator(
227319
dataset_nanoaod = [
228320
name[:-1] for name in dataset_full_names if name[:-1].endswith("NANOAODSIM")
229321
]
322+
230323
i = 1
231324
l = len(dataset_nanoaod)
325+
232326
for dataset in dataset_nanoaod:
233327
recid = RECID_INFO[dataset]
234328

235-
print(f"Getting LHE {i}/{l}")
329+
#print(f"Getting LHE {i}/{l}")
236330
log(recid, "INFO", f"Getting LHE {i}/{l}")
237331
log(recid, "INFO", f"Found record ID {recid}")
238332
log(recid, "INFO", f"Found dataset {dataset}")

0 commit comments

Comments (0)