from gridmap import Job

import erna
-from erna import stream_runner
+from erna.utils import create_filename_from_format
+from erna import stream_runner as stream_runner_std
+from erna import stream_runner_local_output as stream_runner_local
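+# stream_runner_std gathers every job's output in the mother process;
+# stream_runner_local_output lets each job write its result to disk itself.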
+
import erna.datacheck_conditions as dcc

logger = logging.getLogger(__name__)


-def make_jobs(jar, xml, aux_source_path, output_directory, df_mapping, engine, queue, vmem, num_runs_per_bunch, walltime):
+def make_jobs(jar, xml, aux_source_path, output_directory, df_mapping, engine, queue, vmem, num_runs_per_bunch, walltime, output_path=None, filename_format="{basename}_{num}.json"):
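+    """
+    Bundle the runs in df_mapping into bunches of num_runs_per_bunch and
+    create one gridmap Job per bunch. If output_path is given, each job
+    writes its own output file named according to filename_format instead
+    of returning its result to the mother process.
+    """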
    jobs = []
+
+    if output_path:
+        logger.info("Using stream runner for local output")
+    else:
+        logger.debug("Using std stream runner gathering output from all nodes")
+
    # create job objects
+
    df_mapping["bunch_index"] = np.arange(len(df_mapping)) // num_runs_per_bunch
    for num, df in df_mapping.groupby("bunch_index"):
        df = df.copy()
        df["bunch_index"] = num
-        job = Job(stream_runner.run, [jar, xml, df, aux_source_path], queue=queue, walltime=walltime, engine=engine, mem_free='{}mb'.format(vmem))
-        jobs.append(job)
+
+        if output_path:
+            # create the filenames for each single local run
+            file_name, _ = path.splitext(path.basename(output_path))
+            file_name = create_filename_from_format(filename_format, file_name, num)
+            out_path = path.dirname(output_path)
+            run = [jar, xml, df, path.join(out_path, file_name), aux_source_path]
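+            # note: the local runner's run() receives the per-bunch output
+            # file path as an additional argument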
+            stream_runner = stream_runner_local
+        else:
+            run = [jar, xml, df, aux_source_path]
+            stream_runner = stream_runner_std
+
+        jobs.append(
+            Job(stream_runner.run,
+                run,
+                queue=queue,
+                walltime=walltime,
+                engine=engine,
+                mem_free='{}mb'.format(vmem)
+                )
+            )
+    avg_num_files = np.mean([len(part) for num, part in df_mapping.groupby("bunch_index")])
+    logger.info("Created {} jobs with {} files each on average.".format(len(jobs), avg_num_files))

    return jobs

@@ -47,9 +78,20 @@ def make_jobs(jar, xml, aux_source_path, output_directory, df_mapping, engine,
@click.option('--conditions', help='Name of the data conditions as given in datacheck_conditions.py, e.g. standard', default='standard')
@click.option('--max_delta_t', default=30, help='Maximum time difference (minutes) allowed between drs and data files.', type=click.INT)
@click.option('--local', default=False, is_flag=True, help='Flag indicating whether jobs should be executed locally.')
+@click.option('--local_output', default=False, is_flag=True,
+              help='Flag indicating whether jobs write their output locally '
+              + 'to disk without gathering everything in the mother '
+              + 'process. In this case the output file only contains a '
+              + 'summary of the processed jobs; the data output will be '
+              + 'in separate files.',
+              show_default=True)
+@click.option('--local_output_format', default="{basename}_{num}.json",
+              help="File name format for the local output functionality. "
+              + "'{basename}' is replaced by the basename of the output file "
+              + "and '{num}' by the bunch number. "
+              + "Default is: '{basename}_{num}.json'. Only works with option --local_output.")
@click.option('--yes', help="Assume 'yes' if you are asked to continue processing and start jobs", default=False, is_flag=True)
@click.password_option(help='password to read from the always awesome RunDB')
-def main(earliest_night, latest_night, data_dir, jar, xml, aux_source, out, queue, walltime, engine, num_runs, vmem, log_level, port, source, conditions, max_delta_t, local, yes, password):
+def main(earliest_night, latest_night, data_dir, jar, xml, aux_source, out, queue, walltime, engine, num_runs, vmem, log_level, port, source, conditions, max_delta_t, local, local_output, local_output_format, yes, password):

    level = logging.INFO
    if log_level == 'DEBUG':
@@ -74,19 +116,31 @@ def main(earliest_night, latest_night, data_dir, jar, xml, aux_source, out, queu
    factdb = sqlalchemy.create_engine("mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password))
    data_conditions = dcc.conditions[conditions]
    df_runs = erna.load(earliest_night, latest_night, data_dir, source_name=source, timedelta_in_minutes=max_delta_t, factdb=factdb, data_conditions=data_conditions)
-
+
    # check for missing data and fix possible wrong file extension (.fz->.gz)
    df = erna.test_data_path(df_runs, "data_path")
+
    df_runs = df[df['data_file_exists']]
    df_runs_missing = df[np.logical_not(df['data_file_exists'])]
-
-    logger.warn("Missing {} dataruns due to missing datafiles".format(len(df_runs_missing)))

+    logger.warn("Missing {} dataruns due to missing datafiles".format(len(df_runs_missing)))
    logger.info("Would process {} jobs with {} runs per job".format(len(df_runs)//num_runs, num_runs))
    if not yes:
        click.confirm('Do you want to continue processing and start jobs?', abort=True)

-    job_list = make_jobs(jarpath, xmlpath, aux_source_path, output_directory, df_runs, engine, queue, vmem, num_runs, walltime)
+    if local_output:
+        job_list = make_jobs(jarpath, xmlpath, aux_source_path,
+                             output_directory, df_runs, engine, queue,
+                             vmem, num_runs, walltime,
+                             output_path=local_output_dir,
+                             filename_format=local_output_format
+                             )
+    else:
+        job_list = make_jobs(jarpath, xmlpath, aux_source_path,
+                             output_directory, df_runs, engine, queue,
+                             vmem, num_runs, walltime
+                             )
+
    job_outputs = gridmap.process_jobs(job_list, max_processes=len(job_list), local=local)
    erna.collect_output(job_outputs, out, df_runs)

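For reference, a minimal sketch of what erna.utils.create_filename_from_format presumably does, assuming it simply substitutes the 'basename' and 'num' keys via str.format; the actual helper lives in erna.utils and may differ:

    # Hypothetical sketch, not the actual erna.utils implementation.
    def create_filename_from_format(filename_format, basename, num):
        # e.g. ("{basename}_{num}.json", "crab", 3) -> "crab_3.json"
        return filename_format.format(basename=basename, num=num)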