46
46
import os
47
47
import time
48
48
import numpy as np
49
+ import concurrent .futures
50
+ import multiprocessing
49
51
50
52
from romtools .workflows .workflow_utils import create_empty_dir
51
53
from romtools .workflows .models import Model
52
54
from romtools .workflows .parameter_spaces import ParameterSpace
53
55
54
56
57
+ def _get_run_id_from_run_dir (run_dir ):
58
+ return int (run_dir .split ('_' )[- 1 ])
59
+
60
+
55
61
def _create_parameter_dict (parameter_names , parameter_values ):
56
62
return dict (zip (parameter_names , parameter_values ))
57
63
58
64
59
65
def run_sampling (model : Model ,
60
66
parameter_space : ParameterSpace ,
61
67
absolute_sampling_directory : str ,
68
+ evaluation_concurrency = 1 ,
62
69
number_of_samples : int = 10 ,
63
70
random_seed : int = 1 ,
64
71
dry_run : bool = False ,
@@ -67,6 +74,17 @@ def run_sampling(model: Model,
67
74
Core algorithm
68
75
'''
69
76
77
+ # we use here spawn because the default fork causes issues with mpich,
78
+ # see here: https://github.com/Pressio/rom-tools-and-workflows/pull/206
79
+ #
80
+ # to read more about fork/spawn:
81
+ # https://docs.python.org/3/library/multiprocessing.html#multiprocessing-start-methods
82
+ #
83
+ # and
84
+ # https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor
85
+ #
86
+ mp_cntxt = multiprocessing .get_context ("spawn" )
87
+
70
88
np .random .seed (random_seed )
71
89
72
90
# create parameter samples
@@ -85,40 +103,65 @@ def run_sampling(model: Model,
85
103
model .populate_run_directory (run_directory , parameter_dict )
86
104
run_directories .append (run_directory )
87
105
88
- # Run cases if dry_run is not set
106
+ # Print MPI warnings
107
+ print ("""
108
+ Warning: If you are using your model with MPI via a direct call to `mpirun -n ...`,
109
+ be aware that this may or may not work for issues that are purely related to MPI.
110
+ """ )
89
111
if not dry_run :
90
- run_times = np .zeros (number_of_samples )
91
- for sample_index in range (0 , number_of_samples ):
92
- print ("======= Sample " + str (sample_index ) + " ============" )
93
- run_directory = f'{ run_directory_base } { sample_index } '
94
- if "passed.txt" in os .listdir (run_directory ) and not overwrite :
95
- print ("Skipping (Sample has already run successfully)" )
96
- else :
112
+ # Run cases
113
+ if evaluation_concurrency == 1 :
114
+ run_times = np .zeros (number_of_samples )
115
+ for sample_index in range (0 , number_of_samples ):
116
+ print ("======= Sample " + str (sample_index ) + " ============" )
97
117
print ("Running" )
98
- parameter_dict = _create_parameter_dict (parameter_names , parameter_samples [sample_index ])
99
- run_times [sample_index ] = run_sample (run_directory , model , parameter_dict )
100
- sample_stats_save_directory = f'{ run_directory_base } { sample_index } /../'
101
- np .savez (f'{ sample_stats_save_directory } /sampling_stats' ,
102
- run_times = run_times )
118
+ run_directory = f'{ run_directory_base } { sample_index } '
119
+ if "passed.txt" in os .listdir (run_directory ) and not overwrite :
120
+ print ("Skipping (Sample has already run successfully)" )
121
+ else :
122
+ print ("Running" )
123
+ parameter_dict = _create_parameter_dict (parameter_names , parameter_samples [sample_index ])
124
+ run_times [sample_index ] = run_sample (run_directory , model , parameter_dict )
125
+ sample_stats_save_directory = f'{ run_directory_base } { sample_index } /../'
126
+ np .savez (f'{ sample_stats_save_directory } /sampling_stats' ,
127
+ run_times = run_times )
128
+ else :
129
+ #Identify samples to run
130
+ samples_to_run = []
131
+ for sample_index in range (0 , number_of_samples ):
132
+ run_directory = f'{ run_directory_base } { sample_index } '
133
+ if "passed.txt" in os .listdir (run_directory ) and not overwrite :
134
+ print (f"Skipping sample { sample_index } (Sample has already run successfully)" )
135
+ pass
136
+ else :
137
+ samples_to_run .append (sample_index )
138
+ with concurrent .futures .ProcessPoolExecutor (max_workers = evaluation_concurrency , mp_context = mp_cntxt ) as executor :
139
+ these_futures = [executor .submit (run_sample ,
140
+ f'{ run_directory_base } { sample_id } ' , model ,
141
+ _create_parameter_dict (parameter_names , parameter_samples [sample_id ]))
142
+ for sample_id in samples_to_run ]
103
143
104
- return run_directories
144
+ # Wait for all processes to finish
145
+ concurrent .futures .wait (these_futures )
105
146
147
+ run_times = [future .result () for future in these_futures ]
148
+ sample_stats_save_directory = f'{ run_directory_base } { sample_index } /../'
149
+ np .savez (f'{ sample_stats_save_directory } /sampling_stats' , run_times = run_times )
150
+
151
+ return run_directories
106
152
107
- def run_sample (run_directory : str , model : Model ,
108
- parameter_sample : dict ):
109
- '''
110
- Execute individual sample
111
- '''
112
153
154
def run_sample(run_directory: str, model: Model, parameter_sample: dict):
    '''
    Execute one sample of the model inside run_directory and time it.

    Args:
        run_directory: directory for this sample; its name must end in
            ``_<id>`` so the run id can be recovered for log messages.
        model: object exposing ``run_model(run_directory, parameter_sample)``
            returning 0 on success (nonzero otherwise).
        parameter_sample: mapping of parameter name -> value for this run.

    Returns:
        Wall-clock run time in seconds.

    Side effects:
        On success (flag == 0) writes a ``passed.txt`` marker file into
        run_directory, which run_sampling uses to skip completed samples.
    '''
    run_id = _get_run_id_from_run_dir(run_directory)

    start = time.time()
    exit_flag = model.run_model(run_directory, parameter_sample)
    elapsed = time.time() - start

    if exit_flag == 0:
        print(f"Sample {run_id} is complete, run time = {elapsed}")
        # marker file consumed by run_sampling to detect already-finished runs
        np.savetxt(os.path.join(run_directory, 'passed.txt'), np.array([0]), '%i')
    else:
        print(f"Sample {run_id} failed, run time = {elapsed}")
    print(" ")
    return elapsed
0 commit comments