
Commit f0927d8

Clean up YAML after job completion, add example configs for dry_run and cifar benchmarks
1 parent 9833fb4 commit f0927d8

File tree: 7 files changed (+118, -15 lines)

benchmark/configs/cifar_cpu/cifar_cpu_ctnr.yml renamed to benchmark/configs/cifar_cpu/cifar_cpu_docker.yml
4 additions, 4 deletions

@@ -1,8 +1,8 @@
-# Configuration file of FAR training experiment using Aggregator & Executor containers
+# Configuration file of FAR training experiment using Aggregator & Executor containers and docker for container deployment
 
 # ========== Container configuration ==========
 # whether to use container deployment
-use_container: True
+use_container: docker
 
 # containers need port-mapping to communicate with host machine
 # E.g., 1 aggregator and 2 executor, ports: [Aggr, Exec1, Exec2]
@@ -47,15 +47,15 @@ setup_commands:
 
 # We use fixed paths in job_conf as they will be accessed inside containers
 job_conf:
-    - job_name: cifar_ctnr                  # Generate logs under this folder: log_path/job_name/time_stamp
+    - job_name: cifar_docker                # Generate logs under this folder: log_path/job_name/time_stamp
     - log_path: /FedScale/benchmark         # Path of log files
     - num_participants: 4                   # Number of participants per round, we use K=100 in our paper, large K will be much slower
     - data_set: cifar10                     # Dataset: openImg, google_speech, stackoverflow
     - data_dir: /FedScale/benchmark/dataset/data/    # Path of the dataset
     - model: shufflenet_v2_x2_0             # NOTE: Please refer to our model zoo README and use models for these small image (e.g., 32x32x3) inputs
 #    - model_zoo: fedscale-zoo              # Default zoo (torchcv) uses the pytorchvision zoo, which can not support small images well
     - eval_interval: 10                     # How many rounds to run a testing on the testing set
-    - rounds: 20                            # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
+    - rounds: 21                            # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
     - filter_less: 0                        # Remove clients w/ less than 21 samples
     - num_loaders: 2
     - local_steps: 20
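This rename changes `use_container` from a boolean to a string naming the deployment backend. A minimal sketch of how a driver might dispatch on the new value (the function and the legacy-boolean handling are hypothetical illustrations, not FedScale's actual API):

```python
def select_backend(use_container):
    """Map the `use_container` config value to a deployment backend name.

    Hypothetical dispatch illustrating the config change from the old
    boolean form (`use_container: True`) to the new string values.
    """
    if use_container == "docker":
        return "docker"
    elif use_container == "k8s":
        return "k8s"
    elif use_container in (True, "True"):
        # Legacy boolean form: assumed here to mean docker deployment.
        return "docker"
    return "none"  # run natively without containers
```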
New file: 49 additions, 0 deletions

@@ -0,0 +1,49 @@
+# Configuration file of FAR training experiment using Aggregator & Executor containers and k8s for container deployment
+
+# ========== Container configuration ==========
+# whether to use container deployment
+use_container: k8s
+
+# containers need a data-path mount to facilitate dataset reuse
+# We assume the same data-path is used on all host machines
+data_path: $FEDSCALE_HOME/benchmark
+
+# ========== Cluster configuration ==========
+# k8s-specific
+# number of aggregators, right now we only support a single aggregator
+# placeholder for supporting hierarchical aggregator in the future
+num_aggregators: 1
+
+# k8s-specific
+# number of executors
+num_executors: 2
+
+auth:
+    ssh_user: ""
+    ssh_private_key: ~/.ssh/id_rsa
+
+# cmd to run before we can indeed run FAR (in order)
+setup_commands:
+
+
+# ========== Additional job configuration ==========
+# Default parameters are specified in config_parser.py, wherein more description of the parameter can be found
+
+# We use fixed paths in job_conf as they will be accessed inside containers
+job_conf:
+    - job_name: cifar_k8s                   # Generate logs under this folder: log_path/job_name/time_stamp
+    - log_path: /FedScale/benchmark         # Path of log files
+    - num_participants: 4                   # Number of participants per round, we use K=100 in our paper, large K will be much slower
+    - data_set: cifar10                     # Dataset: openImg, google_speech, stackoverflow
+    - data_dir: /FedScale/benchmark/dataset/data/    # Path of the dataset
+    - model: shufflenet_v2_x2_0             # NOTE: Please refer to our model zoo README and use models for these small image (e.g., 32x32x3) inputs
+#    - model_zoo: fedscale-zoo              # Default zoo (torchcv) uses the pytorchvision zoo, which can not support small images well
+    - eval_interval: 10                     # How many rounds to run a testing on the testing set
+    - rounds: 21                            # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
+    - filter_less: 0                        # Remove clients w/ less than 21 samples
+    - num_loaders: 2
+    - local_steps: 20
+    - learning_rate: 0.05
+    - batch_size: 32
+    - test_bsz: 32
+    - use_cuda: False
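Note that `job_conf` is a YAML list of single-key mappings, so after parsing, a consumer has to merge the entries into one flat dict before looking up parameters. A small sketch of that flattening step, using a hand-written dict standing in for the parsed config above (this is an illustration, not the project's actual `config_parser.py` logic):

```python
# A k8s config like the one above, after YAML parsing, looks like this:
conf = {
    "use_container": "k8s",
    "num_aggregators": 1,
    "num_executors": 2,
    "job_conf": [
        {"job_name": "cifar_k8s"},
        {"num_participants": 4},
        {"rounds": 21},
    ],
}

def flatten_job_conf(conf):
    """Merge the list of single-key mappings under job_conf into one dict."""
    merged = {}
    for entry in conf["job_conf"]:
        merged.update(entry)
    return merged

job = flatten_job_conf(conf)
# job["job_name"] == "cifar_k8s", job["rounds"] == 21
```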

benchmark/configs/dry_run/dry_run_ctnr.yml renamed to benchmark/configs/dry_run/dry_run_docker.yml
3 additions, 3 deletions

@@ -1,8 +1,8 @@
-# Configuration file of dry run experiment using Aggregator & Executor containers
+# Configuration file of dry run experiment using Aggregator & Executor containers and docker for container deployment
 
 # ========== Container configuration ==========
 # whether to use container deployment
-use_container: True
+use_container: docker
 
 # containers need port-mapping to communicate with host machine
 # E.g., 1 aggregator and 2 executor, ports: [Aggr, Exec1, Exec2]
@@ -48,7 +48,7 @@ setup_commands:
 
 # We use fixed paths in job_conf as they will be accessed inside containers
 job_conf:
-    - job_name: dryrun_ctnr                 # Generate logs under this folder: log_path/job_name/time_stamp
+    - job_name: dryrun_docker               # Generate logs under this folder: log_path/job_name/time_stamp
     - log_path: /FedScale/benchmark         # Path of log files
     - num_participants: 4                   # Number of participants per round, we use K=100 in our paper, large K will be much slower
     - data_set: cifar10                     # Dataset: openImg, google_speech, stackoverflow
New file: 48 additions, 0 deletions

@@ -0,0 +1,48 @@
+# Configuration file of dry run experiment using Aggregator & Executor containers and k8s for container deployment
+
+# ========== Container configuration ==========
+# whether to use container deployment
+use_container: k8s
+
+# containers need a data-path mount to facilitate dataset reuse
+# We assume the same data-path is used on all host machines
+data_path: $FEDSCALE_HOME/benchmark
+
+# ========== Cluster configuration ==========
+# k8s-specific
+# number of aggregators, right now we only support a single aggregator
+# placeholder for supporting hierarchical aggregator in the future
+num_aggregators: 1
+
+# k8s-specific
+# number of executors
+num_executors: 2
+
+auth:
+    ssh_user: ""
+    ssh_private_key: ~/.ssh/id_rsa
+
+# cmd to run before we can indeed run FAR (in order)
+setup_commands:
+
+
+# ========== Additional job configuration ==========
+# Default parameters are specified in config_parser.py, wherein more description of the parameter can be found
+
+# We use fixed paths in job_conf as they will be accessed inside containers
+job_conf:
+    - job_name: dryrun_k8s                  # Generate logs under this folder: log_path/job_name/time_stamp
+    - log_path: /FedScale/benchmark         # Path of log files
+    - num_participants: 4                   # Number of participants per round, we use K=100 in our paper, large K will be much slower
+    - data_set: cifar10                     # Dataset: openImg, google_speech, stackoverflow
+    - data_dir: /FedScale/benchmark/dataset/data/    # Path of the dataset
+    - model: resnet18                       # Models: e.g., shufflenet_v2_x2_0, mobilenet_v2, resnet34, albert-base-v2
+#    - gradient_policy: yogi                # {"fed-yogi", "fed-prox", "fed-avg"}, "fed-avg" by default
+    - eval_interval: 10                     # How many rounds to run a testing on the testing set
+    - rounds: 21                            # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
+    - filter_less: 0                        # Remove clients w/ less than 21 samples
+    - num_loaders: 2
+    - local_steps: 20
+    - learning_rate: 0.001
+    - batch_size: 32
+    - test_bsz: 32
+    - use_cuda: False

benchmark/configs/femnist/conf_docker.yml
1 addition, 1 deletion

@@ -48,7 +48,7 @@ setup_commands:
 
 # We use fixed paths in job_conf as they will be accessed inside containers
 job_conf:
-    - job_name: femnist_ctnr                # Generate logs under this folder: log_path/job_name/time_stamp
+    - job_name: femnist_docker              # Generate logs under this folder: log_path/job_name/time_stamp
     - log_path: /FedScale/benchmark         # Path of log files
     - num_participants: 50                  # Number of participants per round, we use K=100 in our paper, large K will be much slower
     - data_set: femnist                     # Dataset: openImg, google_speech, stackoverflow

benchmark/configs/femnist/conf_k8s.yml
3 additions, 3 deletions

@@ -6,7 +6,7 @@ use_container: k8s
 
 # containers need a data-path mount to facilitate dataset reuse
 # We assume the same data-path is used on all host machines
-data_path: /users/yilegu/benchmark
+data_path: $FEDSCALE_HOME/benchmark
 
 # ========== Cluster configuration ==========
 # k8s-specific
@@ -20,7 +20,7 @@ num_executors: 2
 
 auth:
-    ssh_user: "yilegu"
+    ssh_user: ""
     ssh_private_key: ~/.ssh/id_rsa
 
 # cmd to run before we can indeed run FAR (in order)
@@ -32,7 +32,7 @@ setup_commands:
 
 # We use fixed paths in job_conf as they will be accessed inside containers
 job_conf:
-    - job_name: femnist_ctnr                # Generate logs under this folder: log_path/job_name/time_stamp
+    - job_name: femnist_k8s                 # Generate logs under this folder: log_path/job_name/time_stamp
     - log_path: /FedScale/benchmark         # Path of log files
     - num_participants: 5                   # Number of participants per round, we use K=100 in our paper, large K will be much slower
     - data_set: femnist                     # Dataset: openImg, google_speech, stackoverflow

docker/driver.py
10 additions, 4 deletions

@@ -263,6 +263,9 @@ def terminate(job_name):
     config.load_kube_config()
     core_api = client.CoreV1Api()
     for name, meta_dict in job_meta['k8s_dict'].items():
+        if os.path.exists(meta_dict["yaml_path"]):
+            os.remove(meta_dict["yaml_path"])
+
         print(f"Shutting down container {name}...")
         core_api.delete_namespaced_pod(name, namespace="default")
@@ -328,12 +331,14 @@ def submit_to_k8s(yaml_conf):
             "data_path": yaml_conf["data_path"],
             "pod_name": exec_name
         }
+
+        exec_yaml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{exec_name}.yaml')
+        generate_exec_template(exec_config, exec_yaml_path)
         k8s_dict[exec_name] = {
             "type": "executor",
-            "rank_id": rank_id
+            "rank_id": rank_id,
+            "yaml_path": exec_yaml_path
         }
-        exec_yaml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{exec_name}.yaml')
-        generate_exec_template(exec_config, exec_yaml_path)
         print(f'Submitting executor container {exec_name} to k8s...')
         # TODO: logging?
         utils.create_from_yaml(k8s_client, exec_yaml_path, namespace="default")
@@ -355,7 +360,8 @@ def submit_to_k8s(yaml_conf):
     k8s_dict[aggr_name] = {
         "type": "aggregator",
         "ip": aggr_ip,
-        "rank_id": 0
+        "rank_id": 0,
+        "yaml_path": aggr_yaml_path
     }
 
     # TODO: refactor the code so that docker/k8s version invoke the same init function
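The driver change records the path of each generated pod YAML in `k8s_dict` at submit time so that `terminate` can delete the files once the job completes. The bookkeeping pattern can be sketched in isolation (a minimal sketch with illustrative pod names; the real driver fills the YAMLs from templates and talks to the k8s API):

```python
import os
import tempfile

def submit(pod_names, workdir):
    """Generate a YAML per pod and remember its path for later cleanup."""
    k8s_dict = {}
    for rank_id, name in enumerate(pod_names, start=1):
        yaml_path = os.path.join(workdir, f"{name}.yaml")
        with open(yaml_path, "w") as f:
            f.write(f"# pod spec for {name}\n")  # stand-in for the real template
        k8s_dict[name] = {
            "type": "executor",
            "rank_id": rank_id,
            "yaml_path": yaml_path,  # recorded so terminate() can clean up
        }
    return k8s_dict

def terminate(k8s_dict):
    """Remove the generated YAML files, tolerating already-deleted ones."""
    for name, meta in k8s_dict.items():
        if os.path.exists(meta["yaml_path"]):
            os.remove(meta["yaml_path"])

with tempfile.TemporaryDirectory() as d:
    meta = submit(["fedscale-exec-1", "fedscale-exec-2"], d)
    terminate(meta)  # leaves no generated YAML files behind
```

The `os.path.exists` guard mirrors the committed code: cleanup stays idempotent even if a file was removed by an earlier, interrupted shutdown.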
