
Commit f5cee3d

Merge pull request #69 from fact-project/add_local_option_to_runlist
Add local_output option to process scripts
2 parents d173003 + eb25baf commit f5cee3d

File tree

5 files changed: +142 -36 lines changed

erna/scripts/process_fact_data.py

Lines changed: 63 additions & 9 deletions
@@ -9,21 +9,52 @@
 from gridmap import Job
 
 import erna
-from erna import stream_runner
+from erna.utils import create_filename_from_format
+from erna import stream_runner as stream_runner_std
+from erna import stream_runner_local_output as stream_runner_local
+
 import erna.datacheck_conditions as dcc
 
 logger = logging.getLogger(__name__)
 
 
-def make_jobs(jar, xml, aux_source_path, output_directory, df_mapping, engine, queue, vmem, num_runs_per_bunch, walltime):
+def make_jobs(jar, xml, aux_source_path, output_directory, df_mapping, engine, queue, vmem, num_runs_per_bunch, walltime, output_path=None, filename_format="{basename}_{num}.json"):
     jobs = []
+
+    if output_path:
+        logger.info("Using stream runner for local output")
+    else:
+        logger.debug("Using std stream runner gathering output from all nodes")
+
     # create job objects
+
     df_mapping["bunch_index"]= np.arange(len(df_mapping)) // num_runs_per_bunch
     for num, df in df_mapping.groupby("bunch_index"):
         df=df.copy()
         df["bunch_index"] = num
-        job = Job(stream_runner.run, [jar, xml, df, aux_source_path], queue=queue, walltime=walltime, engine=engine, mem_free='{}mb'.format(vmem))
-        jobs.append(job)
+
+        if output_path:
+            # create the filenames for each single local run
+            file_name, _ = path.splitext(path.basename(output_path))
+            file_name = create_filename_from_format(filename_format, file_name, num)
+            out_path = path.dirname(output_path)
+            run = [jar, xml, df, path.join(out_path, file_name), aux_source_path]
+            stream_runner = stream_runner_local
+        else:
+            run = [jar, xml, df, aux_source_path]
+            stream_runner = stream_runner_std
+
+        jobs.append(
+            Job(stream_runner.run,
+                run,
+                queue=queue,
+                walltime=walltime,
+                engine=engine,
+                mem_free='{}mb'.format(vmem)
+                )
+            )
+    avg_num_files = np.mean([len(part) for num, part in df_mapping.groupby("bunch_index")])
+    logger.info("Created {} jobs with {} files each.".format(len(jobs), avg_num_files))
 
     return jobs
 
@@ -47,9 +78,20 @@ def make_jobs(jar, xml, aux_source_path, output_directory, df_mapping, engine,
 @click.option('--conditions', help='Name of the data conditions as given in datacheck_conditions.py e.g standard', default='standard')
 @click.option('--max_delta_t', default=30, help='Maximum time difference (minutes) allowed between drs and data files.', type=click.INT)
 @click.option('--local', default=False,is_flag=True, help='Flag indicating whether jobs should be executed localy .')
+@click.option('--local_output', default=False, is_flag=True,
+              help='Flag indicating whether jobs write their output locally '
+              + 'to disk without gathering everything in the mother '
+              + 'process. In this case the output file only contains a '
+              + 'summary of the processed jobs. The data output will be '
+              + 'in separate files.',
+              show_default=True)
+@click.option('--local_output_format', default="{basename}_{num}.json",
+              help="Format string for the local output functionality. "
+              + "{basename} is replaced by the basename of the output file and {num} by the job number. "
+              + "Default is: '{basename}_{num}.json'. Only works with option --local_output.")
+@click.password_option(help='password to read from the always awesome RunDB')
 @click.option('--yes', help="Assume 'yes'if your asked to continue processing and start jobs", default=False, is_flag=True)
 @click.password_option(help='password to read from the always awesome RunDB')
-def main(earliest_night, latest_night, data_dir, jar, xml, aux_source, out, queue, walltime, engine, num_runs, vmem, log_level, port, source, conditions, max_delta_t, local, yes, password):
+def main(earliest_night, latest_night, data_dir, jar, xml, aux_source, out, queue, walltime, engine, num_runs, vmem, log_level, port, source, conditions, max_delta_t, local, local_output, local_output_format, yes, password):
 
     level=logging.INFO
     if log_level is 'DEBUG':
@@ -74,19 +116,31 @@ def main(earliest_night, latest_night, data_dir, jar, xml, aux_source, out, queu
     factdb = sqlalchemy.create_engine("mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password))
     data_conditions=dcc.conditions[conditions]
     df_runs = erna.load(earliest_night, latest_night, data_dir, source_name=source, timedelta_in_minutes=max_delta_t, factdb=factdb, data_conditions=data_conditions)
-
+
     # check for missing data and fix possible wrong file extension (.fz->.gz)
     df = erna.test_data_path(df_runs, "data_path")
+
     df_runs = df[df['data_file_exists']]
     df_runs_missing = df[np.logical_not(df['data_file_exists'])]
-
-    logger.warn("Missing {} dataruns due to missing datafiles".format(len(df_runs_missing)))
 
+    logger.warn("Missing {} dataruns due to missing datafiles".format(len(df_runs_missing)))
     logger.info("Would process {} jobs with {} runs per job".format(len(df_runs)//num_runs, num_runs))
     if not yes:
         click.confirm('Do you want to continue processing and start jobs?', abort=True)
 
-    job_list = make_jobs(jarpath, xmlpath, aux_source_path, output_directory, df_runs, engine, queue, vmem, num_runs, walltime)
+    if local_output:
+        job_list = make_jobs(jarpath, xmlpath, aux_source_path,
+                             output_directory, df_runs, engine, queue,
+                             vmem, num_runs, walltime,
+                             output_path=local_output_dir,
+                             filename_format=local_output_format
+                             )
+    else:
+        job_list = make_jobs(jarpath, xmlpath, aux_source_path,
+                             output_directory, df_runs, engine, queue,
+                             vmem, num_runs, walltime
+                             )
+
     job_outputs = gridmap.process_jobs(job_list, max_processes=len(job_list), local=local)
     erna.collect_output(job_outputs, out, df_runs)
 
erna/scripts/process_fact_mc.py

Lines changed: 5 additions & 15 deletions
@@ -5,6 +5,7 @@
 from os import path
 
 import erna
+from erna.utils import create_filename_from_format
 from erna import stream_runner as stream_runner_std
 from erna import stream_runner_local_output as stream_runner_local
 
@@ -15,33 +16,22 @@
 
 logger = logging.getLogger(__name__)
 
-import re
-
-def create_filename_from_format(filename_format, basename, num):
-    """
-    Given a special format string, create a filename_format with the basename and a given number.
-    There are two named variables that can be used, one is basename which inserts the basename
-    and the second one is num which is mandatory.
-    """
-    m = re.search('\{num', filename_format)
-    if not m:
-        raise ValueError("Missing named placeholder 'num' in format string")
-    return filename_format.format({"basename":basename, "num":num})
-
-
 def make_jobs(jar, xml, data_paths, drs_paths,
               engine, queue, vmem, num_jobs, walltime, output_path=None, filename_format="{basename}_{num}.json"):
     jobs = []
 
     data_partitions = np.array_split(data_paths, num_jobs)
     drs_partitions = np.array_split(drs_paths, num_jobs)
     if output_path:
-        logger.info("Using stream runner für local output")
+        logger.info("Using stream runner for local output")
     else:
         logger.debug("Using std stream runner gathering output from all nodes")
 
     for num, (data, drs) in enumerate(zip(data_partitions, drs_partitions)):
         df = pd.DataFrame({'data_path': data, 'drs_path': drs})
+        df=df.copy()
+        df["bunch_index"] = num
+
         if output_path:
             # create the filenames for each single local run
             file_name, _ = path.splitext(path.basename(output_path))
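For context, np.array_split (used above to build the bunches) accepts a partition count that does not evenly divide the number of files, so bunch sizes may differ by one. A minimal sketch with invented file names:

    import numpy as np

    data_paths = ["data_{}.fits.fz".format(i) for i in range(10)]
    drs_paths = ["drs_{}.drs.fits.gz".format(i) for i in range(10)]

    data_partitions = np.array_split(data_paths, 3)
    drs_partitions = np.array_split(drs_paths, 3)

    for num, (data, drs) in enumerate(zip(data_partitions, drs_partitions)):
        print(num, len(data), len(drs))  # bunch sizes come out as 4, 3 and 3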

erna/scripts/process_fact_run_list.py

Lines changed: 60 additions & 11 deletions
@@ -5,7 +5,9 @@
 from os import path
 import os
 import erna
-from erna import stream_runner
+from erna.utils import create_filename_from_format
+from erna import stream_runner as stream_runner_std
+from erna import stream_runner_local_output as stream_runner_local
 import gridmap
 from gridmap import Job
 
@@ -14,18 +16,44 @@
 
 
 def make_jobs(jar, xml, aux_source_path, output_directory, df_mapping, engine, queue,
-              vmem, num_jobs, walltime):
+              vmem, num_jobs, walltime, output_path=None, filename_format="{basename}_{num}.json"):
     jobs = []
+
+    if output_path:
+        logger.info("Using stream runner for local output")
+    else:
+        logger.debug("Using std stream runner gathering output from all nodes")
+
     # create job objects
     split_indices = np.array_split(np.arange(len(df_mapping)), num_jobs)
     for num, indices in enumerate(split_indices):
         df = df_mapping[indices.min(): indices.max()]
-
-        job = Job(stream_runner.run, [jar, xml, df, aux_source_path],
-                  queue=queue, walltime=walltime, engine=engine,
-                  mem_free='{}mb'.format(vmem))
-        jobs.append(job)
-
+        df=df.copy()
+        df["bunch_index"] = num
+
+        if output_path:
+            # create the filenames for each single local run
+            file_name, _ = path.splitext(path.basename(output_path))
+            file_name = create_filename_from_format(filename_format, file_name, num)
+            out_path = path.dirname(output_path)
+            run = [jar, xml, df, path.join(out_path, file_name), aux_source_path]
+            stream_runner = stream_runner_local
+        else:
+            run = [jar, xml, df, aux_source_path]
+            stream_runner = stream_runner_std
+
+        jobs.append(
+            Job(stream_runner.run,
+                run,
+                queue=queue,
+                walltime=walltime,
+                engine=engine,
+                mem_free='{}mb'.format(vmem)
+                )
+            )
+
+    avg_num_files = np.mean([len(part) for part in split_indices])
+    logger.info("Created {} jobs with {} files each.".format(len(jobs), avg_num_files))
     return jobs
 
 
@@ -45,7 +73,17 @@ def make_jobs(jar, xml, aux_source_path, output_directory, df_mapping, engine,
 @click.option("--log_level", type=click.Choice(['INFO', 'DEBUG', 'WARN']), help='increase output verbosity', default='INFO')
 @click.option('--port', help='The port through which to communicate with the JobMonitor', default=12856, type=int)
 @click.option('--local', default=False,is_flag=True, help='Flag indicating whether jobs should be executed localy .')
-def main(file_list, jar, xml, aux_source, out, queue, walltime, engine, num_jobs, vmem, log_level, port, local):
+@click.option('--local_output', default=False, is_flag=True,
+              help='Flag indicating whether jobs write their output locally '
+              + 'to disk without gathering everything in the mother '
+              + 'process. In this case the output file only contains a '
+              + 'summary of the processed jobs. The data output will be '
+              + 'in separate files.',
+              show_default=True)
+@click.option('--local_output_format', default="{basename}_{num}.json",
+              help="Format string for the local output functionality. "
+              + "{basename} is replaced by the basename of the output file and {num} by the job number. "
+              + "Default is: '{basename}_{num}.json'. Only works with option --local_output.")
+def main(file_list, jar, xml, aux_source, out, queue, walltime, engine, num_jobs, vmem, log_level, port, local, local_output, local_output_format):
     '''
     Specify the path to a .json file as created by the fetch_runs.py script via the FILE_LIST argument.
     num_jobs will be created and executed on the cluster.
@@ -74,8 +112,19 @@ def main(file_list, jar, xml, aux_source, out, queue, walltime, engine, num_jobs
     os.makedirs(output_directory, exist_ok=True)
     logger.info("Writing output and temporary data to {}".format(output_directory))
 
-
-    job_list = make_jobs(jarpath, xmlpath, aux_source_path, output_directory, df, engine, queue, vmem, num_jobs, walltime)
+    if local_output:
+        job_list = make_jobs(jarpath, xmlpath, aux_source_path,
+                             output_directory, df, engine, queue,
+                             vmem, num_jobs, walltime,
+                             output_path=local_output_dir,
+                             filename_format=local_output_format
+                             )
+    else:
+        job_list = make_jobs(jarpath, xmlpath, aux_source_path,
+                             output_directory, df, engine, queue,
+                             vmem, num_jobs, walltime,
+                             )
+
     job_outputs = gridmap.process_jobs(job_list, max_processes=num_jobs, local=local)
     erna.collect_output(job_outputs, out, df)
 
erna/utils.py

Lines changed: 13 additions & 0 deletions
@@ -93,3 +93,16 @@ def check_environment_on_node():
     subprocess.check_call(['which', 'java'])
     subprocess.check_call(['free', '-m'])
     subprocess.check_call(['java', '-Xmx512m', '-version'])
+
+import re
+
+def create_filename_from_format(filename_format, basename, num):
+    """
+    Given a format string, create a filename from the basename and a given number.
+    There are two named placeholders that can be used: basename, which inserts the basename,
+    and num, which is mandatory.
+    """
+    m = re.search(r'\{num', filename_format)
+    if not m:
+        raise ValueError("Missing named placeholder 'num' in format string")
+    return filename_format.format(basename=basename, num=num)
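A quick usage sketch of this helper:

    from erna.utils import create_filename_from_format

    print(create_filename_from_format("{basename}_{num}.json", "crab_data", 7))
    # crab_data_7.json

    create_filename_from_format("{basename}.json", "crab_data", 7)
    # raises ValueError: Missing named placeholder 'num' in format string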

setup.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 setup(
     name='erna',
-    version='0.8.1',
+    version='0.8.2',
     description='Easy RuN Access. Tools that help to do batch processing of FACT data',
     url='https://github.com/fact-project/erna',
     author='Kai Brügge, Jens Buss, Maximilian Nöthe',
