diff --git a/benchmark/configs/async_fl/async_fl.yml b/benchmark/configs/async_fl/async_fl.yml deleted file mode 100644 index 4d1448ef..00000000 --- a/benchmark/configs/async_fl/async_fl.yml +++ /dev/null @@ -1,61 +0,0 @@ -# Configuration file of FAR training experiment - -# ========== Cluster configuration ========== -# ip address of the parameter server (need 1 GPU process) -ps_ip: localhost - -# ip address of each worker:# of available gpus process on each gpu in this node -# Note that if we collocate ps and worker on same GPU, then we need to decrease this number of available processes on that GPU by 1 -# E.g., master node has 4 available processes, then 1 for the ps, and worker should be set to: worker:3 -worker_ips: - - localhost:[2,2,2,2] - -exp_path: $FEDSCALE_HOME/fedscale/cloud - -# Entry function of executor and aggregator under $exp_path -executor_entry: ../../examples/async_fl/async_executor.py - -aggregator_entry: ../../examples/async_fl/async_aggregator.py - -auth: - ssh_user: "" - ssh_private_key: ~/.ssh/id_rsa - -# cmd to run before we can indeed run FAR (in order) -setup_commands: - - source $HOME/anaconda3/bin/activate fedscale - -# ========== Additional job configuration ========== -# Default parameters are specified in config_parser.py, wherein more description of the parameter can be found - -# NOTE: We are supporting and improving the following implementation (Async FL) in FedScale: - # - "PAPAYA: Practical, Private, and Scalable Federated Learning", MLSys, 2022 - # - "Federated Learning with Buffered Asynchronous Aggregation", AISTATS, 2022 - -# We appreciate you to contribute and/or report bugs. Thank you! - -job_conf: - - job_name: async_femnist # Generate logs under this folder: log_path/job_name/time_stamp - - log_path: $FEDSCALE_HOME/benchmark # Path of log files - - data_set: femnist # Dataset: openImg, google_speech, stackoverflow - - data_dir: $FEDSCALE_HOME/benchmark/dataset/data/femnist # Path of the dataset - - data_map_file: $FEDSCALE_HOME/benchmark/dataset/data/femnist/client_data_mapping/train.csv # Allocation of data to each client, turn to iid setting if not provided - - device_conf_file: $FEDSCALE_HOME/benchmark/dataset/data/device_info/client_device_capacity # Path of the client trace - - device_avail_file: $FEDSCALE_HOME/benchmark/dataset/data/device_info/client_behave_trace - - model: resnet18 # NOTE: Please refer to our model zoo README and use models for these small image (e.g., 32x32x3) inputs -# - model_zoo: fedscale-zoo - - eval_interval: 5 # How many rounds to run a testing on the testing set - - rounds: 1000 # Number of rounds to run this training. 
We use 1000 in our paper, while it may converge w/ ~400 rounds - - filter_less: 21 # Remove clients w/ less than 21 samples - - num_loaders: 2 - - local_steps: 5 - - learning_rate: 0.05 - - batch_size: 20 - - test_bsz: 20 - - ps_port: 12342 - - use_cuda: True - - overcommitment: 1.0 - - arrival_interval: 5 - - max_staleness: 5 - - max_concurrency: 100 - - async_buffer: 50 # Number of updates need to be aggregated before generating new model version diff --git a/benchmark/configs/cifar_cpu/cifar_cpu.yml b/benchmark/configs/cifar_cpu/cifar_cpu.yml index c361d1d3..007c8b6d 100644 --- a/benchmark/configs/cifar_cpu/cifar_cpu.yml +++ b/benchmark/configs/cifar_cpu/cifar_cpu.yml @@ -35,7 +35,7 @@ job_conf: - data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow - data_dir: $FEDSCALE_HOME/benchmark/dataset/data/ # Path of the dataset - model: shufflenet_v2_x2_0 # NOTE: Please refer to our model zoo README and use models for these small image (e.g., 32x32x3) inputs -# - model_zoo: fedscale-zoo # Default zoo (torchcv) uses the pytorchvision zoo, which can not support small images well +# - model_zoo: fedscale-torch-zoo # Default zoo (torchcv) uses the pytorchvision zoo, which can not support small images well - eval_interval: 5 # How many rounds to run a testing on the testing set - rounds: 600 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds - filter_less: 0 # Remove clients w/ less than 21 samples diff --git a/benchmark/configs/docker_deploy/cifar_cpu_docker.yml b/benchmark/configs/docker_deploy/cifar_cpu_docker.yml index 0106799a..86ec9678 100644 --- a/benchmark/configs/docker_deploy/cifar_cpu_docker.yml +++ b/benchmark/configs/docker_deploy/cifar_cpu_docker.yml @@ -54,7 +54,7 @@ job_conf: - data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow - data_dir: /FedScale/benchmark/dataset/data/ # Path of the dataset - model: shufflenet_v2_x2_0 # NOTE: Please refer to our model zoo README and use models for these small image (e.g., 32x32x3) inputs -# - model_zoo: fedscale-zoo # Default zoo (torchcv) uses the pytorchvision zoo, which can not support small images well +# - model_zoo: fedscale-torch-zoo # Default zoo (torchcv) uses the pytorchvision zoo, which can not support small images well - eval_interval: 10 # How many rounds to run a testing on the testing set - rounds: 21 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds - filter_less: 0 # Remove clients w/ less than 21 samples diff --git a/benchmark/configs/docker_deploy/femnist_docker.yml b/benchmark/configs/docker_deploy/femnist_docker.yml index 87521a88..4069d362 100644 --- a/benchmark/configs/docker_deploy/femnist_docker.yml +++ b/benchmark/configs/docker_deploy/femnist_docker.yml @@ -58,7 +58,7 @@ job_conf: - device_conf_file: /FedScale/benchmark/dataset/data/device_info/client_device_capacity # Path of the client trace - device_avail_file: /FedScale/benchmark/dataset/data/device_info/client_behave_trace - model: resnet18 # NOTE: Please refer to our model zoo README and use models for these small image (e.g., 32x32x3) inputs -# - model_zoo: fedscale-zoo +# - model_zoo: fedscale-torch-zoo - eval_interval: 10 # How many rounds to run a testing on the testing set - rounds: 20 # Number of rounds to run this training. 
We use 1000 in our paper, while it may converge w/ ~400 rounds - filter_less: 21 # Remove clients w/ less than 21 samples diff --git a/benchmark/configs/femnist/conf.yml b/benchmark/configs/femnist/conf.yml index aaa2f17f..e59f65b6 100644 --- a/benchmark/configs/femnist/conf.yml +++ b/benchmark/configs/femnist/conf.yml @@ -38,7 +38,7 @@ job_conf: - device_conf_file: $FEDSCALE_HOME/benchmark/dataset/data/device_info/client_device_capacity # Path of the client trace - device_avail_file: $FEDSCALE_HOME/benchmark/dataset/data/device_info/client_behave_trace - model: resnet18 # NOTE: Please refer to our model zoo README and use models for these small image (e.g., 32x32x3) inputs -# - model_zoo: fedscale-zoo +# - model_zoo: fedscale-torch-zoo - eval_interval: 10 # How many rounds to run a testing on the testing set - rounds: 1000 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds - filter_less: 21 # Remove clients w/ less than 21 samples diff --git a/benchmark/configs/k8s_deploy/cifar_cpu_k8s.yml b/benchmark/configs/k8s_deploy/cifar_cpu_k8s.yml index 57c9c94a..5ba3179e 100644 --- a/benchmark/configs/k8s_deploy/cifar_cpu_k8s.yml +++ b/benchmark/configs/k8s_deploy/cifar_cpu_k8s.yml @@ -36,7 +36,7 @@ job_conf: - data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow - data_dir: /FedScale/benchmark/dataset/data/ # Path of the dataset - model: shufflenet_v2_x2_0 # NOTE: Please refer to our model zoo README and use models for these small image (e.g., 32x32x3) inputs -# - model_zoo: fedscale-zoo # Default zoo (torchcv) uses the pytorchvision zoo, which can not support small images well +# - model_zoo: fedscale-torch-zoo # Default zoo (torchcv) uses the pytorchvision zoo, which can not support small images well - eval_interval: 10 # How many rounds to run a testing on the testing set - rounds: 21 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds - filter_less: 0 # Remove clients w/ less than 21 samples diff --git a/benchmark/configs/k8s_deploy/femnist_k8s.yml b/benchmark/configs/k8s_deploy/femnist_k8s.yml index 90c85b6e..f38ee48f 100644 --- a/benchmark/configs/k8s_deploy/femnist_k8s.yml +++ b/benchmark/configs/k8s_deploy/femnist_k8s.yml @@ -40,7 +40,7 @@ job_conf: - device_conf_file: /FedScale/benchmark/dataset/data/device_info/client_device_capacity # Path of the client trace - device_avail_file: /FedScale/benchmark/dataset/data/device_info/client_behave_trace - model: resnet18 # NOTE: Please refer to our model zoo README and use models for these small image (e.g., 32x32x3) inputs -# - model_zoo: fedscale-zoo +# - model_zoo: fedscale-torch-zoo - eval_interval: 10 # How many rounds to run a testing on the testing set - rounds: 21 # Number of rounds to run this training. 
We use 1000 in our paper, while it may converge w/ ~400 rounds - filter_less: 21 # Remove clients w/ less than 21 samples diff --git a/benchmark/configs/tensorflow_engine/tf-engine.yml b/benchmark/configs/tensorflow_engine/tf-engine.yml deleted file mode 100644 index 14031a71..00000000 --- a/benchmark/configs/tensorflow_engine/tf-engine.yml +++ /dev/null @@ -1,47 +0,0 @@ -# Configuration file of running tensorflow backend - -# ========== Cluster configuration ========== -# ip address of the parameter server (need 1 GPU process) -ps_ip: 10.0.0.1 - -# ip address of each worker:# of available gpus process on each gpu in this node -# Note that if we collocate ps and worker on same GPU, then we need to decrease this number of available processes on that GPU by 1 -# E.g., master node has 4 available processes, then 1 for the ps, and worker should be set to: worker:3 -worker_ips: - - 10.0.0.1:[1] # worker_ip: [(# processes on gpu) for gpu in available_gpus] eg. 10.0.0.2:[4,4,4,4] This node has 4 gpus, each gpu has 4 processes. - -exp_path: $FEDSCALE_HOME/fedscale/cloud - -# Entry function of executor and aggregator under $exp_path -executor_entry: $FEDSCALE_HOME/examples/tensorflow_engine/tf_executor.py - -aggregator_entry: $FEDSCALE_HOME/examples/tensorflow_engine/tf_aggregator.py - -auth: - ssh_user: "" - ssh_private_key: ~/.ssh/id_rsa - -# cmd to run before we can indeed run FAR (in order) -setup_commands: - - source $HOME/anaconda3/bin/activate fedscale - -# ========== Additional job configuration ========== -# Default parameters are specified in config_parser.py, wherein more description of the parameter can be found - -job_conf: - - job_name: tf-engine # Generate logs under this folder: log_path/job_name/time_stamp - - log_path: $FEDSCALE_HOME/benchmark # Path of log files - - num_participants: 4 # Number of participants per round, we use K=100 in our paper, large K will be much slower - - data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow - - data_dir: $FEDSCALE_HOME/benchmark/dataset/data/ # Path of the dataset - - model: resnet50 # Need to define the model in tf_aggregator.py - - eval_interval: 5000 # How many rounds to run a testing on the testing set - - rounds: 200 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds - - filter_less: 0 # Remove clients w/ less than 21 samples - - num_loaders: 2 - - local_steps: 20 - - learning_rate: 0.001 - - batch_size: 32 - - test_bsz: 32 - - use_cuda: False - - engine: 'tensorflow' diff --git a/benchmark/configs/tf_cifar/tf_cifar.yml b/benchmark/configs/tf_cifar/tf_cifar.yml new file mode 100644 index 00000000..cd59bec5 --- /dev/null +++ b/benchmark/configs/tf_cifar/tf_cifar.yml @@ -0,0 +1,50 @@ +# Configuration file of running tensorflow backend + +# ========== Cluster configuration ========== +# ip address of the parameter server (need 1 GPU process) +ps_ip: localhost + +# ip address of each worker:# of available gpus process on each gpu in this node +# Note that if we collocate ps and worker on same GPU, then we need to decrease this number of available processes on that GPU by 1 +# E.g., master node has 4 available processes, then 1 for the ps, and worker should be set to: worker:3 +worker_ips: + - localhost:[1] # worker_ip: [(# processes on gpu) for gpu in available_gpus] eg. 10.0.0.2:[4,4,4,4] This node has 4 gpus, each gpu has 4 processes. 
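(For readers unfamiliar with the `worker_ips` syntax described in the comment above: a hedged sketch of how an entry such as `10.0.0.2:[4,4,4,4]` can be split into an IP and per-GPU process counts. `parse_worker_entry` is an illustrative helper, not FedScale's actual parser in `docker/driver.py`.)

```
# Illustrative only -- FedScale's real parsing lives in docker/driver.py.
def parse_worker_entry(entry: str):
    """Split 'ip:[n1,n2,...]' into (ip, per-GPU process counts)."""
    ip, counts = entry.split(':', 1)
    gpu_processes = [int(n) for n in counts.strip('[]').split(',')]
    return ip, gpu_processes

ip, procs = parse_worker_entry('10.0.0.2:[4,4,4,4]')
assert ip == '10.0.0.2' and sum(procs) == 16  # 4 GPUs x 4 processes each
```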
+ +exp_path: $FEDSCALE_HOME/fedscale/cloud + +# Entry function of executor and aggregator under $exp_path +executor_entry: execution/executor.py + +aggregator_entry: aggregation/aggregator.py + +auth: + ssh_user: "" + ssh_private_key: ~/.ssh/id_rsa + +# cmd to run before we can indeed run FAR (in order) +setup_commands: + - source $HOME/anaconda3/bin/activate fedscale + +# ========== Additional job configuration ========== +# Default parameters are specified in config_parser.py, wherein more description of the parameter can be found + +job_conf: + - job_name: tf-cifar10 # Generate logs under this folder: log_path/job_name/time_stamp + - log_path: $FEDSCALE_HOME/benchmark # Path of log files + - num_participants: 4 # Number of participants per round, we use K=100 in our paper, large K will be much slower + - data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow + - data_dir: $FEDSCALE_HOME/benchmark/dataset/data/ # Path of the dataset + - model: resnet50 # Need to define the model in tf_aggregator.py + - model_zoo: fedscale-tensorflow-zoo + - eval_interval: 5000 # How many rounds to run a testing on the testing set + - rounds: 200 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds + - filter_less: 0 # Remove clients w/ less than 21 samples + - num_loaders: 2 + - local_steps: 20 + - learning_rate: 0.001 + - input_shape: 32 32 3 + - batch_size: 32 + - num_classes: 10 + - test_bsz: 32 + - use_cuda: False + - engine: 'tensorflow' diff --git a/benchmark/configs/tf_femnist/tf_femnist.yml b/benchmark/configs/tf_femnist/tf_femnist.yml new file mode 100644 index 00000000..eb3b3c5a --- /dev/null +++ b/benchmark/configs/tf_femnist/tf_femnist.yml @@ -0,0 +1,50 @@ +# Configuration file of running tensorflow backend + +# ========== Cluster configuration ========== +# ip address of the parameter server (need 1 GPU process) +ps_ip: localhost + +# ip address of each worker:# of available gpus process on each gpu in this node +# Note that if we collocate ps and worker on same GPU, then we need to decrease this number of available processes on that GPU by 1 +# E.g., master node has 4 available processes, then 1 for the ps, and worker should be set to: worker:3 +worker_ips: + - localhost:[1] # worker_ip: [(# processes on gpu) for gpu in available_gpus] eg. 10.0.0.2:[4,4,4,4] This node has 4 gpus, each gpu has 4 processes. 
+ +exp_path: $FEDSCALE_HOME/fedscale/cloud + +# Entry function of executor and aggregator under $exp_path +executor_entry: execution/executor.py + +aggregator_entry: aggregation/aggregator.py + +auth: + ssh_user: "" + ssh_private_key: ~/.ssh/id_rsa + +# cmd to run before we can indeed run FAR (in order) +setup_commands: + - source $HOME/anaconda3/bin/activate fedscale + +# ========== Additional job configuration ========== +# Default parameters are specified in config_parser.py, wherein more description of the parameter can be found + +job_conf: + - job_name: tf-femnist # Generate logs under this folder: log_path/job_name/time_stamp + - log_path: $FEDSCALE_HOME/benchmark # Path of log files + - num_participants: 4 # Number of participants per round, we use K=100 in our paper, large K will be much slower + - data_set: femnist # Dataset: openImg, google_speech, stackoverflow + - data_dir: $FEDSCALE_HOME/benchmark/dataset/data/femnist # Path of the dataset + - model: resnet50 # Need to define the model in tf_aggregator.py + - model_zoo: fedscale-tensorflow-zoo + - eval_interval: 5000 # How many rounds to run a testing on the testing set + - rounds: 200 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds + - filter_less: 0 # Remove clients w/ less than 21 samples + - num_loaders: 2 + - local_steps: 20 + - learning_rate: 0.001 + - batch_size: 32 + - input_shape: 32 32 3 + - num_classes: 62 + - test_bsz: 32 + - use_cuda: False + - engine: 'tensorflow' diff --git a/docker/driver.py b/docker/driver.py index 94ce2fdd..8e6b7b99 100644 --- a/docker/driver.py +++ b/docker/driver.py @@ -53,9 +53,9 @@ def process_cmd(yaml_file, local=False): exit(1) else: use_container = "default" - - + + ps_ip = yaml_conf['ps_ip'] worker_ips, total_gpus = [], [] cmd_script_list = [] @@ -79,7 +79,7 @@ def process_cmd(yaml_file, local=False): for conf in yaml_conf['job_conf']: job_conf.update(conf) - + conf_script = '' setup_cmd = '' if yaml_conf['setup_commands'] is not None: @@ -90,7 +90,7 @@ def process_cmd(yaml_file, local=False): cmd_sufix = f" " for conf_name in job_conf: - conf_script = conf_script + f' --{conf_name}={job_conf[conf_name]}' + conf_script = conf_script + f' --{conf_name} {job_conf[conf_name]}' if conf_name == "job_name": job_name = job_conf[conf_name] if conf_name == "log_path": @@ -152,7 +152,7 @@ def process_cmd(yaml_file, local=False): "rank_id": rank_id, "cuda_id": cuda_id } - + worker_cmd = f" docker run -i --name fedscale-exec{rank_id}-{time_stamp} --network {yaml_conf['container_network']} -p {ports[rank_id]}:32000 --mount type=bind,source={yaml_conf['data_path']},target=/FedScale/benchmark fedscale/fedscale-exec" else: worker_cmd = f" python {yaml_conf['exp_path']}/{yaml_conf['executor_entry']} {conf_script} --this_rank={rank_id} --num_executors={total_gpu_processes} --cuda_device=cuda:{cuda_id} " @@ -236,7 +236,7 @@ def process_cmd(yaml_file, local=False): msg = json.dumps(msg) send_socket.sendall(msg.encode('utf-8')) send_socket.close() - break + break print(f"Submitted job, please check your logs {job_conf['log_path']}/logs/{job_conf['job_name']}/{time_stamp} for status") @@ -258,7 +258,7 @@ def terminate(job_name): print(f"Shutting down container {name} on {meta_dict['ip']}") with open(f"{job_name}_logging", 'a') as fout: subprocess.Popen(f'ssh {job_meta["user"]}{meta_dict["ip"]} "docker rm --force {name}"', - shell=True, stdout=fout, stderr=fout) + shell=True, stdout=fout, stderr=fout) elif job_meta['use_container'] == "k8s": # for 
now, assume we run in k8s admin mode, placeholder for client job submission in the future config.load_kube_config() @@ -266,11 +266,11 @@ def terminate(job_name): for name, meta_dict in job_meta['k8s_dict'].items(): if os.path.exists(meta_dict["yaml_path"]): os.remove(meta_dict["yaml_path"]) - + print(f"Shutting down container {name}...") core_api.delete_namespaced_pod(name, namespace="fedscale") - else: + else: for vm_ip in job_meta['vms']: print(f"Shutting down job on {vm_ip}") with open(f"{job_name}_logging", 'a') as fout: @@ -293,7 +293,7 @@ def submit_to_k8s(yaml_conf): namespace_config = client.V1Namespace( metadata=client.V1ObjectMeta(name="fedscale")) core_api.create_namespace(namespace_config) - + time_stamp = datetime.datetime.fromtimestamp( time.time()).strftime('%m%d_%H%M%S') running_vms = set() @@ -334,7 +334,7 @@ def submit_to_k8s(yaml_conf): "rank_id": 0, "yaml_path": aggr_yaml_path } - + print(f"Submitting aggregator container {aggr_name} to k8s...") # TODO: logging? @@ -380,7 +380,7 @@ def submit_to_k8s(yaml_conf): time.sleep(1) if aggr_ip == -1: print(f"Error: aggregator {name} not ready after maximum waiting time allowed, aborting...") - exit(1) + exit(1) meta_dict["ip"] = aggr_ip elif meta_dict['type'] == 'executor': print(f'Waiting executor container {name} to be ready...') @@ -404,7 +404,7 @@ def submit_to_k8s(yaml_conf): else: print(f"Error: unrecognized type {meta_dict['type']}!") exit(1) - + # TODO: make executors init multi-threaded to boost performance for name, meta_dict in k8s_dict.items(): @@ -456,7 +456,7 @@ def submit_to_k8s(yaml_conf): msg = json.dumps(msg) send_socket.sendall(msg.encode('utf-8')) send_socket.close() - break + break else: print(f"Error: unrecognized type {meta_dict['type']}!") exit(1) @@ -468,14 +468,14 @@ def submit_to_k8s(yaml_conf): def check_log(job_name): current_path = os.path.dirname(os.path.abspath(__file__)) - job_meta_path = os.path.join(current_path, job_name) + job_meta_path = os.path.join(current_path, job_name) if not os.path.isfile(job_meta_path): print(f"Error: fail to terminate {job_name}, as it does not exist") exit(1) with open(job_meta_path, 'rb') as fin: job_meta = pickle.load(fin) - + if job_meta['use_container'] == 'k8s': for name, meta_dict in job_meta['k8s_dict'].items(): if meta_dict['type'] != 'aggregator': @@ -488,7 +488,7 @@ def check_log(job_name): else: print("Error: only support checking job logs running in k8s mode!") exit(1) - + print_help: bool = False if len(sys.argv) > 1: diff --git a/docker/dryrun b/docker/dryrun new file mode 100644 index 00000000..5f2607c9 Binary files /dev/null and b/docker/dryrun differ diff --git a/docker/tf-cifar10 b/docker/tf-cifar10 new file mode 100644 index 00000000..5f2607c9 Binary files /dev/null and b/docker/tf-cifar10 differ diff --git a/environment.yml b/environment.yml index f1dbf326..388bb302 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,7 @@ dependencies: - matplotlib==3.1.3 - torch_baidu_ctc==0.3.0 - tensorboardX==2.1 + - overrides==3.1.0 - python-levenshtein==0.12.0 - pandas==1.1.0 - PyYAML diff --git a/examples/README.md b/examples/README.md index 55024af6..e3af435a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -79,7 +79,7 @@ In the function `round_weight_handler`, you can customize your aggregator optimi The following code segment shows how FedYoGi and FedAvg aggregate the participant gradients. 
``` -class ServerOptimizer(object): +class TorchServerOptimizer(object): def __init__(self, mode, args, device, sample_seed=233): self.mode = mode diff --git a/examples/async_fl/async_aggregator.py b/examples/async_fl/async_aggregator.py deleted file mode 100644 index 0125c197..00000000 --- a/examples/async_fl/async_aggregator.py +++ /dev/null @@ -1,509 +0,0 @@ -# -*- coding: utf-8 -*- -import collections -import os -import sys - -import torch - -import fedscale.cloud.config_parser as parser -from fedscale.cloud import commons -from fedscale.cloud.aggregation.aggregator import Aggregator -from fedscale.cloud.channels import job_api_pb2 -from fedscale.cloud.logger.aggragation import * - -sys.path.append(os.path.dirname(os.path.abspath(__file__))) -from resource_manager import ResourceManager - -MAX_MESSAGE_LENGTH = 1 * 1024 * 1024 * 1024 # 1GB - -# NOTE: We are supporting and improving the following implementation (Async FL) in FedScale: - # - "PAPAYA: Practical, Private, and Scalable Federated Learning", MLSys, 2022 - # - "Federated Learning with Buffered Asynchronous Aggregation", AISTATS, 2022 - -# We appreciate you to contribute and/or report bugs. Thank you! - -class AsyncAggregator(Aggregator): - """This centralized aggregator collects training/testing feedbacks from executors""" - - def __init__(self, args): - Aggregator.__init__(self, args) - self.resource_manager = ResourceManager(self.experiment_mode) - self.async_buffer_size = args.async_buffer - self.max_concurrency = args.max_concurrency - self.client_round_duration = {} - self.client_start_time = collections.defaultdict(list) - self.round_stamp = [0] - self.client_model_version = collections.defaultdict(list) - self.virtual_client_clock = {} - self.weight_tensor_type = {} - - # We need to keep the test model for specific round to avoid async mismatch - self.test_model = None - self.aggregate_update = {} - self.importance_sum = 0 - self.client_end = [] - self.round_staleness = [] - self.round_tasks_issued = 0 - # self.model_concurrency = collections.defaultdict(int) - - def run(self): - """Start running the aggregator server by setting up execution - and communication environment, and monitoring the grpc message. - """ - self.setup_env() - self.init_control_communication() - self.init_data_communication() - - self.init_model() - self.save_last_param() - self.model_update_size = sys.getsizeof( - pickle.dumps(self.model)) / 1024.0 * 8. # kbits - self.client_profiles = self.load_client_profile( - file_path=self.args.device_conf_file) - - self.event_monitor() - - - def tictak_client_tasks(self, sampled_clients, num_clients_to_collect): - - if self.experiment_mode == commons.SIMULATION_MODE: - # NOTE: We try to remove dummy events as much as possible in simulations, - # by removing the stragglers/offline clients in overcommitment""" - sampledClientsReal = [] - startTimes = [] - endTimes = [] - completed_client_clock = {} - - start_time = self.global_virtual_clock - constant_checkin_period = self.args.arrival_interval - # 1. 
remove dummy clients that are not available to the end of training - concurreny_count = 0 - - end_list = [] - end_j = 0 - for client_to_run in sampled_clients: - client_cfg = self.client_conf.get(client_to_run, self.args) - exe_cost = self.client_manager.get_completion_time(client_to_run, - batch_size=client_cfg.batch_size, local_steps=client_cfg.local_steps, - upload_size=self.model_update_size, download_size=self.model_update_size) - - roundDuration = exe_cost['computation'] + \ - exe_cost['communication'] - # if the client is not active by the time of collection, we consider it is lost in this round - start_time += constant_checkin_period - end_time = roundDuration + start_time - end_list.append(end_time) - while start_time > end_list[end_j]: - concurreny_count -= 1 - end_j += 1 - if concurreny_count > self.max_concurrency: - end_list.pop() - continue - - if self.client_manager.isClientActive(client_to_run, end_time): - concurreny_count += 1 - sampledClientsReal.append(client_to_run) - completed_client_clock[client_to_run] = exe_cost - startTimes.append(start_time) - self.client_start_time[client_to_run].append(start_time) - self.client_round_duration[client_to_run] = roundDuration - endTimes.append(end_time) - - num_clients_to_collect = min( - num_clients_to_collect, len(sampledClientsReal)) - # 2. sort & execute clients based on completion time - sortedWorkersByCompletion = sorted( - range(len(endTimes)), key=lambda k: endTimes[k]) - top_k_index = sortedWorkersByCompletion[:num_clients_to_collect] - clients_to_run = [sampledClientsReal[k] for k in top_k_index] - endTimes = [endTimes[k] for k in top_k_index] - return (clients_to_run, - endTimes, - completed_client_clock) # dict : string the speed for each client - - else: - completed_client_clock = { - client: {'computation': 1, 'communication': 1} for client in sampled_clients} - completionTimes = [1 for c in sampled_clients] - return (sampled_clients, sampled_clients, completed_client_clock, - 1, completionTimes) - - def save_last_param(self): - """ Save the last model parameters - """ - self.last_gradient_weights = [ - p.data.clone() for p in self.model.parameters()] - self.model_weights = copy.deepcopy(self.model.state_dict()) - self.weight_tensor_type = {p: self.model_weights[p].data.dtype \ - for p in self.model_weights} - - def aggregate_client_weights(self, results): - """May aggregate client updates on the fly""" - """ - "PAPAYA: PRACTICAL, PRIVATE, AND SCALABLE FEDERATED LEARNING". 
MLSys, 2022 - """ - # Start to take the average of updates, and we do not keep updates to save memory - # Importance of each update is 1/staleness - client_staleness = self.round - self.client_model_version[results['clientId']].pop(0) - - importance = 1./(math.sqrt(1 + client_staleness)) - self.round_staleness.append(client_staleness) - - new_round_aggregation = (self.model_in_update == 1) - if new_round_aggregation: - self.importance_sum = 0 - self.importance_sum += importance - - for p in results['update_weight']: - # Different to cloud/executor, update_weight here is (train_model_weight - untrained) - param_weight = results['update_weight'][p] - - if isinstance(param_weight, list): - param_weight = np.asarray(param_weight, dtype=np.float32) - param_weight = torch.from_numpy( - param_weight).to(device=self.device) - - if new_round_aggregation: - self.aggregate_update[p] = param_weight * importance - else: - self.aggregate_update[p] += param_weight * importance - - if self.model_in_update == self.async_buffer_size: - for p in self.model_weights: - d_type = self.weight_tensor_type[p] - self.model_weights[p].data = ( - self.model_weights[p].data + self.aggregate_update[p] / float(self.importance_sum) # self.model_in_update - ).to(dtype=d_type) - - def round_completion_handler(self): - self.round += 1 - - logging.info(f"Round {self.round} average staleness {np.mean(self.round_staleness)}") - self.round_staleness = [] - self.global_virtual_clock = self.round_stamp[-1] - - if self.round % self.args.decay_round == 0: - self.args.learning_rate = max( - self.args.learning_rate * self.args.decay_factor, self.args.min_learning_rate) - - # handle the global update w/ current and last - self.round_weight_handler(self.last_gradient_weights) - - avg_loss = sum(self.loss_accumulator) / \ - max(1, len(self.loss_accumulator)) - logging.info(f"Wall clock: {round(self.global_virtual_clock)} s, round: {self.round}, asyn running participants: " + - f"{self.resource_manager.get_task_length()}, aggregating {len(self.stats_util_accumulator)} participants, " + - f"training loss: {avg_loss}") - - # dump round completion information to tensorboard - if len(self.loss_accumulator): - self.log_train_result(avg_loss) - - # update select participants - # NOTE: we simulate async, while have to sync every 10 rounds to avoid large division to trace - if self.resource_manager.get_task_length() < self.async_buffer_size * 5: - - self.sampled_participants = self.select_participants( - select_num_participants=self.async_buffer_size*10, overcommitment=self.args.overcommitment) - (clientsToRun, clientsEndTime, virtual_client_clock) = self.tictak_client_tasks( - self.sampled_participants, len(self.sampled_participants)) - - logging.info(f"{len(clientsToRun)} clients with constant arrival following the order: {clientsToRun}") - - # Issue requests to the resource manager; Tasks ordered by the completion time - self.resource_manager.register_tasks(clientsToRun, clientsEndTime) - self.virtual_client_clock.update(virtual_client_clock) - - # Update executors and participants - if self.experiment_mode == commons.SIMULATION_MODE: - self.sampled_executors = list(self.individual_client_events.keys()) - else: - self.sampled_executors = [str(c_id) - for c_id in self.sampled_participants] - - self.save_last_param() - #self.round_stragglers = round_stragglers - - self.model_in_update = 0 - self.test_result_accumulator = [] - self.stats_util_accumulator = [] - self.client_training_results = [] - self.loss_accumulator = [] - # 
self.round_stamp.append(self.global_virtual_clock) - - if self.round >= self.args.rounds: - self.broadcast_aggregator_events(commons.SHUT_DOWN) - elif self.round % self.args.eval_interval == 0: - self.test_model = copy.deepcopy(self.model) - self.broadcast_aggregator_events(commons.UPDATE_MODEL) - self.broadcast_aggregator_events(commons.MODEL_TEST) - else: - self.broadcast_aggregator_events(commons.UPDATE_MODEL) - self.broadcast_aggregator_events(commons.START_ROUND) - - def find_latest_model(self, start_time): - for i, time_stamp in enumerate(reversed(self.round_stamp)): - if start_time >= time_stamp: - return len(self.round_stamp) - i - return 1 - - def get_test_config(self, client_id): - """FL model testing on clients, developers can further define personalized client config here. - - Args: - client_id (int): The client id. - - Returns: - dictionary: The testing config for new task. - - """ - # Get the straggler round-id - client_tasks = self.resource_manager.client_run_queue - current_pending_length = min( - self.resource_manager.client_run_queue_idx, len(client_tasks)-1) - - current_pending_clients = client_tasks[current_pending_length:] - straggler_round = 1e10 - for client in current_pending_clients: - straggler_round = min( - self.find_latest_model(self.client_start_time[client][0]), straggler_round) - - return {'client_id': client_id, - 'straggler_round': straggler_round, - 'test_model': self.test_model} - - def get_client_conf(self, clientId): - """Training configurations that will be applied on clients""" - conf = { - 'learning_rate': self.args.learning_rate, - } - return conf - - def create_client_task(self, executorId): - """Issue a new client training task to the executor""" - - train_config = None - model = None - - # NOTE: in batch execution simulation (i.e., multiple executors), we need to stall task scheduling - # to ensure clients in current async_buffer_size completes ahead of other tasks - with self.update_lock: - logging.info(f"====self.round_tasks_issued ({executorId}) is {self.round_tasks_issued}, {self.async_buffer_size}") - if self.round_tasks_issued < self.async_buffer_size: - next_clientId = self.resource_manager.get_next_task(executorId) - config = self.get_client_conf(next_clientId) - start_time = self.client_start_time[next_clientId][0] - end_time = self.client_round_duration[next_clientId] + start_time - model_id = self.find_latest_model(start_time) - - self.client_model_version[next_clientId].append(model_id) - - # The executor has already received the model, thus sending id is enough - model = model_id - train_config = {'client_id': next_clientId, 'task_config': config, 'end_time': end_time} - logging.info( - f"Client {next_clientId} train on model {model_id} during {int(start_time)}-{int(end_time)}") - - self.round_tasks_issued += 1 - - - return train_config, model - - def log_train_result(self, avg_loss): - """Result will be post on TensorBoard""" - self.log_writer.add_scalar('Train/round_to_loss', avg_loss, self.round) - self.log_writer.add_scalar( - 'FAR/time_to_train_loss (min)', avg_loss, self.global_virtual_clock / 60.) 
- self.log_writer.add_scalar( - 'FAR/round_duration (min)', self.round_duration / 60., self.round) - - def client_completion_handler(self, results): - """We may need to keep all updates from clients, - if so, we need to append results to the cache - - Args: - results (dictionary): client's training result - - """ - # Format: - # -results = {'clientId':clientId, 'update_weight': model_param, 'moving_loss': round_train_loss, - # 'trained_size': count, 'wall_duration': time_cost, 'success': is_success 'utility': utility} - - if self.round - self.client_model_version[results['clientId']][0] > self.args.max_staleness: - logging.info(f"Warning: Ignore stale client {results['clientId']} with {self.round - self.client_model_version[results['clientId']][0]}") - self.client_model_version[results['clientId']].pop(0) - self.client_start_time[results['clientId']].pop(0) - with self.update_lock: - self.round_tasks_issued -= 1 - # self.individual_client_events['1'].append( commons.CLIENT_TRAIN) - return -1 - - # [ASYNC] New checkin clients ID would overlap with previous unfinished clients - logging.info( - f"Client {results['clientId']} completes from {self.client_start_time[results['clientId']][0]} " + - f"to {self.client_start_time[results['clientId']][0]+self.client_round_duration[results['clientId']]}") - - self.client_end.append(self.client_round_duration[results['clientId']] + self.client_start_time[results['clientId']].pop(0)) - - if self.args.gradient_policy in ['q-fedavg']: - self.client_training_results.append(results) - # Feed metrics to client sampler - self.stats_util_accumulator.append(results['utility']) - self.loss_accumulator.append(results['moving_loss']) - - self.client_manager.register_feedback(results['clientId'], results['utility'], - auxi=math.sqrt( - results['moving_loss']), - time_stamp=self.round, - duration=self.virtual_client_clock[results['clientId']]['computation'] + - self.virtual_client_clock[results['clientId']]['communication'] - ) - - # ================== Aggregate weights ====================== - with self.update_lock: - self.model_in_update += 1 - if self.using_group_params == True: - self.aggregate_client_group_weights(results) - else: - self.aggregate_client_weights(results) - - return 0 - - def CLIENT_EXECUTE_COMPLETION(self, request, context): - """FL clients complete the execution task. - - Args: - request (CompleteRequest): Complete request info from executor. - - Returns: - ServerResponse: Server response to job completion request - - """ - - executor_id, client_id, event = request.executor_id, request.client_id, request.event - execution_status, execution_msg = request.status, request.msg - meta_result, data_result = request.meta_result, request.data_result - - if event == commons.CLIENT_TRAIN: - # Training results may be uploaded in CLIENT_EXECUTE_RESULT request later, - # so we need to specify whether to ask client to do so (in case of straggler/timeout in real FL). 
- if execution_status is False: - logging.error(f"Executor {executor_id} fails to run client {client_id}, due to {execution_msg}") - - elif event in (commons.MODEL_TEST, commons.UPLOAD_MODEL): - self.add_event_handler( - executor_id, event, meta_result, data_result) - else: - logging.error(f"Received undefined event {event} from client {client_id}") - - # [ASYNC] Different from sync that only schedule tasks once previous training finish - if self.resource_manager.has_next_task(executor_id) and self.round_tasks_issued < self.async_buffer_size: - # NOTE: we do not pop the train immediately in simulation mode, - # since the executor may run multiple clients - if commons.CLIENT_TRAIN not in self.individual_client_events[executor_id] : - # if event in (commons.MODEL_TEST, commons.UPLOAD_MODEL): - self.individual_client_events[executor_id].append( - commons.CLIENT_TRAIN) - - return self.CLIENT_PING(request, context) - - def CLIENT_PING(self, request, context): - """Handle client ping requests - - Args: - request (PingRequest): Ping request info from executor. - - Returns: - ServerResponse: Server response to ping request - - """ - # NOTE: client_id = executor_id in deployment, - # while multiple client_id may use the same executor_id (VMs) in simulations - executor_id, client_id = request.executor_id, request.client_id - response_data = response_msg = commons.DUMMY_RESPONSE - if len(self.individual_client_events[executor_id]) == 0: - # send dummy response - current_event = commons.DUMMY_EVENT - response_data = response_msg = commons.DUMMY_RESPONSE - else: - logging.info(f"====event queue {executor_id}, {self.individual_client_events[executor_id]}") - current_event = self.individual_client_events[executor_id].popleft() - if current_event == commons.CLIENT_TRAIN: - response_msg, response_data = self.create_client_task( - executor_id) - if response_msg is None: - current_event = commons.DUMMY_EVENT - if self.experiment_mode != commons.SIMULATION_MODE: - self.individual_client_events[executor_id].append( - commons.CLIENT_TRAIN) - elif current_event == commons.MODEL_TEST: - response_msg = self.get_test_config(client_id) - elif current_event == commons.UPDATE_MODEL: - response_data = self.get_global_model() - elif current_event == commons.SHUT_DOWN: - response_msg = self.get_shutdown_config(executor_id) - - response_msg, response_data = self.serialize_response( - response_msg), self.serialize_response(response_data) - # NOTE: in simulation mode, response data is pickle for faster (de)serialization - response = job_api_pb2.ServerResponse(event=current_event, - meta=response_msg, data=response_data) - if current_event != commons.DUMMY_EVENT: - logging.info(f"Issue EVENT ({current_event}) to EXECUTOR ({executor_id})") - - return response - - - def event_monitor(self): - logging.info("Start monitoring events ...") - - while True: - # Broadcast events to clients - if len(self.broadcast_events_queue) > 0: - current_event = self.broadcast_events_queue.popleft() - - if current_event in (commons.UPDATE_MODEL, commons.MODEL_TEST): - self.dispatch_client_events(current_event) - - elif current_event == commons.START_ROUND: - self.dispatch_client_events(commons.CLIENT_TRAIN) - - elif current_event == commons.SHUT_DOWN: - self.dispatch_client_events(commons.SHUT_DOWN) - break - - # Handle events queued on the aggregator - elif len(self.sever_events_queue) > 0: - client_id, current_event, meta, data = self.sever_events_queue.popleft() - - if current_event == commons.UPLOAD_MODEL: - state = 
self.client_completion_handler( - self.deserialize_response(data)) - logging.info( - f"Executor ({client_id}) finish client {self.deserialize_response(data)['clientId']} in round {self.round} [{self.model_in_update}/{ self.async_buffer_size}] ") - if state == -1 : - self.individual_client_events[client_id].append(commons.CLIENT_TRAIN) - - elif self.model_in_update == self.async_buffer_size: - # clientID = self.deserialize_response(data)['clientId'] - - # [ASYNC] handle different completion order - self.round_stamp.append(max(self.client_end)) - self.client_end = [] - self.round_completion_handler() - with self.update_lock: self.round_tasks_issued = 0 - - elif current_event == commons.MODEL_TEST: - self.testing_completion_handler( - client_id, self.deserialize_response(data)) - - else: - logging.error(f"Event {current_event} is not defined") - - else: - # execute every 100 ms - time.sleep(0.1) - -if __name__ == "__main__": - aggregator = AsyncAggregator(parser.args) - aggregator.run() diff --git a/examples/async_fl/async_client.py b/examples/async_fl/async_client.py deleted file mode 100644 index 18e2394d..00000000 --- a/examples/async_fl/async_client.py +++ /dev/null @@ -1,66 +0,0 @@ -import copy -import logging -import math -import pickle - -import torch -from torch.autograd import Variable - -from fedscale.cloud.execution.client import Client -from fedscale.cloud.execution.optimizers import ClientOptimizer -from fedscale.dataloaders.nlp import mask_tokens - - -class Client(Client): - """Basic client component in Federated Learning""" - - def train(self, client_data, model, conf): - - clientId = conf.clientId - logging.info(f"Start to train (CLIENT: {clientId}) ...") - tokenizer, device = conf.tokenizer, conf.device - - model = model.to(device=device) - model.train() - - trained_unique_samples = min( - len(client_data.dataset), conf.local_steps * conf.batch_size) - - self.global_model = None - if conf.gradient_policy == 'fed-prox': - # could be move to optimizer - self.global_model = [param.data.clone() for param in model.parameters()] - - prev_model_dict = copy.deepcopy(model.state_dict()) - optimizer = self.get_optimizer(model, conf) - criterion = self.get_criterion(conf) - error_type = None - - # NOTE: One may hope to run fixed number of epochs, instead of iterations - # then replace the following with "while self.completed_steps < conf.local_steps * len(client_data)" - while self.completed_steps < conf.local_steps: - try: - self.train_step(client_data, conf, model, optimizer, criterion) - except Exception as ex: - error_type = ex - break - - state_dicts = model.state_dict() - # In async, we need the delta_weight only - model_param = {p: (state_dicts[p] - prev_model_dict[p]).data.cpu().numpy() - for p in state_dicts} - results = {'clientId': clientId, 'moving_loss': self.epoch_train_loss, - 'trained_size': self.completed_steps*conf.batch_size, - 'success': self.completed_steps == conf.local_steps} - - if error_type is None: - logging.info(f"Training of (CLIENT: {clientId}) completes, {results}") - else: - logging.info(f"Training of (CLIENT: {clientId}) failed as {error_type}") - - results['utility'] = math.sqrt( - self.loss_squre)*float(trained_unique_samples) - results['update_weight'] = model_param - results['wall_duration'] = 0 - - return results diff --git a/examples/async_fl/async_executor.py b/examples/async_fl/async_executor.py deleted file mode 100644 index fea88dad..00000000 --- a/examples/async_fl/async_executor.py +++ /dev/null @@ -1,172 +0,0 @@ -# -*- coding: utf-8 -*- 
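(Aside, before the deleted executor code continues: a worked sketch of the staleness weighting that `aggregate_client_weights` in the deleted `AsyncAggregator` above implements, following PAPAYA. `buffered_average` is a hypothetical standalone helper for illustration, not FedScale API.)

```
import math

def buffered_average(updates):
    """updates: list of (staleness, delta) pairs in one async buffer."""
    # Each update is scaled by 1/sqrt(1 + staleness), matching
    # `importance = 1./(math.sqrt(1 + client_staleness))` above,
    # then the buffered sum is normalized by the importance sum.
    importances = [1.0 / math.sqrt(1 + s) for s, _ in updates]
    weighted = sum(imp * delta for imp, (_, delta) in zip(importances, updates))
    return weighted / sum(importances)

# Staleness 0, 3, 8 -> weights 1, 1/2, 1/3: fresher updates count more.
print(buffered_average([(0, 1.0), (3, 2.0), (8, 3.0)]))
```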
-import pickle - -import fedscale.cloud.channels.job_api_pb2 as job_api_pb2 -import fedscale.cloud.config_parser as parser -from fedscale.cloud.execution.executor import Executor -from fedscale.cloud.execution.rlclient import RLClient -from fedscale.cloud.logger.execution import * -from fedscale.cloud import commons - -sys.path.append(os.path.dirname(os.path.abspath(__file__))) -from async_client import Client as CustomizedClient - -class AsyncExecutor(Executor): - """Each executor takes certain resource to run real training. - Each run simulates the execution of an individual client""" - - def __init__(self, args): - super().__init__(args) - self.temp_model_path_version = lambda round: os.path.join( - logDir, f'model_{self.this_rank}_{round}.pth.tar') - - def update_model_handler(self, model): - """Update the model copy on this executor""" - self.round += 1 - - # Dump latest model to disk - with open(self.temp_model_path_version(self.round), 'wb') as model_out: - logging.info( - f"Received latest model saved at {self.temp_model_path_version(self.round)}" - ) - pickle.dump(model, model_out) - - def load_global_model(self, round=None): - # load last global model - # logging.info(f"====Load global model with version {round}") - round = min(round, self.round) if round is not None else self.round - with open(self.temp_model_path_version(round), 'rb') as model_in: - model = pickle.load(model_in) - return model - - def get_client_trainer(self, conf): - return CustomizedClient(conf) - - def training_handler(self, clientId, conf, model=None): - """Train model given client ids""" - - # Here model is model_id - client_model = self.load_global_model(model) - - conf.clientId, conf.device = clientId, self.device - conf.tokenizer = tokenizer - if args.task == "rl": - client_data = self.training_sets - client = RLClient(conf) - train_res = client.train( - client_data=client_data, model=client_model, conf=conf) - else: - client_data = select_dataset(clientId, self.training_sets, - batch_size=conf.batch_size, args=self.args, - collate_fn=self.collate_fn - ) - - client = self.get_client_trainer(conf) - train_res = client.train( - client_data=client_data, model=client_model, conf=conf) - - return train_res - - def testing_handler(self, args, config=None): - - evalStart = time.time() - device = self.device - model = config['test_model'] - if self.task == 'rl': - client = RLClient(args) - test_res = client.test(args, self.this_rank, model, device=device) - _, _, _, testResults = test_res - else: - data_loader = select_dataset(self.this_rank, self.testing_sets, - batch_size=args.test_bsz, args=args, - isTest=True, collate_fn=self.collate_fn - ) - - if self.task == 'voice': - criterion = CTCLoss(reduction='mean').to(device=device) - else: - criterion = torch.nn.CrossEntropyLoss().to(device=device) - - if self.args.engine == commons.PYTORCH: - test_res = test_model(self.this_rank, model, data_loader, - device=device, criterion=criterion, tokenizer=tokenizer) - else: - raise Exception(f"Need customized implementation for model testing in {self.args.engine} engine") - - test_loss, acc, acc_5, testResults = test_res - logging.info("After aggregation round {}, CumulTime {}, eval_time {}, test_loss {}, test_accuracy {:.2f}%, test_5_accuracy {:.2f}% \n" - .format(self.round, round(time.time() - self.start_run_time, 4), round(time.time() - evalStart, 4), test_loss, acc*100., acc_5*100.)) - - gc.collect() - - return testResults - - def check_model_version(self, model_id): - return 
os.path.exists(self.temp_model_path_version(model_id)) - - def remove_stale_models(self, straggler_round): - """Remove useless models kept for async execution in the past""" - logging.info(f"Current straggler round is {straggler_round}") - stale_version = straggler_round-1 - while self.check_model_version(stale_version): - logging.info(f"Executor {self.this_rank} removes stale model version {stale_version}") - os.remove(self.temp_model_path_version(stale_version)) - stale_version -= 1 - - def event_monitor(self): - """Activate event handler once receiving new message - """ - logging.info("Start monitoring events ...") - self.client_register() - - while self.received_stop_request == False: - if len(self.event_queue) > 0: - request = self.event_queue.popleft() - current_event = request.event - - logging.info(f"====Poping event {current_event}") - if current_event == commons.CLIENT_TRAIN: - train_config = self.deserialize_response(request.meta) - train_model = self.deserialize_response(request.data) - if train_model is not None and not self.check_model_version(train_model): - # The executor may have not received the model due to async grpc - # TODO: server will lose track of scheduled but not executed task and remove the model - logging.error(f"Warning: Not receive model {train_model} for client {train_config['client_id'] }") - if self.round - train_model <= self.args.max_staleness: - self.event_queue.append(request) - time.sleep(1) - continue - - train_config['model'] = train_model - train_config['client_id'] = int(train_config['client_id']) - client_id, train_res = self.Train(train_config) - - # Upload model updates - future_call = self.aggregator_communicator.stub.CLIENT_EXECUTE_COMPLETION.future( - job_api_pb2.CompleteRequest(client_id=str(client_id), executor_id=self.executor_id, - event=commons.UPLOAD_MODEL, status=True, msg=None, - meta_result=None, data_result=self.serialize_response(train_res) - )) - future_call.add_done_callback(lambda _response: self.dispatch_worker_events(_response.result())) - - elif current_event == commons.MODEL_TEST: - test_configs = self.deserialize_response(request.meta) - self.remove_stale_models(test_configs['straggler_round']) - self.Test(test_configs) - - elif current_event == commons.UPDATE_MODEL: - broadcast_config = self.deserialize_response(request.data) - self.UpdateModel(broadcast_config) - - elif current_event == commons.SHUT_DOWN: - self.Stop() - - elif current_event == commons.DUMMY_EVENT: - pass - else: - time.sleep(1) - self.client_ping() - -if __name__ == "__main__": - executor = AsyncExecutor(parser.args) - executor.run() diff --git a/examples/async_fl/resource_manager.py b/examples/async_fl/resource_manager.py deleted file mode 100644 index 15f6908d..00000000 --- a/examples/async_fl/resource_manager.py +++ /dev/null @@ -1,53 +0,0 @@ -import threading - -from fedscale.cloud import commons -from fedscale.cloud.resource_manager import ResourceManager as DefaultManager -import numpy as np - -class ResourceManager(DefaultManager): - """Schedule training tasks across GPUs/CPUs""" - - def __init__(self, experiment_mode): - super().__init__(experiment_mode) - self.client_run_queue = [] - self.experiment_mode = experiment_mode - self.update_lock = threading.Lock() - self.client_end_queue = [] - - def get_task_length(self): - self.update_lock.acquire() - remaining_task_num: int = len(self.client_run_queue) - self.update_lock.release() - return remaining_task_num - - def register_tasks(self, clientsToRun, clientsEndTime): - self.client_end_queue += 
clientsEndTime.copy() - self.client_run_queue += clientsToRun.copy() - sortedClientIndex = np.argsort(self.client_end_queue) - - self.client_run_queue = [self.client_run_queue[k] for k in sortedClientIndex] - self.client_end_queue = [self.client_end_queue[k] for k in sortedClientIndex] - - def has_next_task(self, client_id=None): - exist_next_task = False - if self.experiment_mode == commons.SIMULATION_MODE: - exist_next_task = len(self.client_run_queue) > 0 - else: - exist_next_task = client_id in self.client_run_queue - return exist_next_task - - def get_next_task(self, client_id=None): - next_task_id = None - self.update_lock.acquire() - if self.experiment_mode == commons.SIMULATION_MODE: - if self.has_next_task(client_id): - next_task_id = self.client_run_queue[0] - self.client_run_queue.pop(0) - self.client_end_queue.pop(0) - else: - if client_id in self.client_run_queue: - next_task_id = client_id - self.client_run_queue.remove(next_task_id) - - self.update_lock.release() - return next_task_id diff --git a/examples/differential_privacy/customized_client.py b/examples/differential_privacy/customized_client.py index 970b992f..b3568dad 100644 --- a/examples/differential_privacy/customized_client.py +++ b/examples/differential_privacy/customized_client.py @@ -8,10 +8,10 @@ from clip_norm import clip_grad_norm_ from torch.autograd import Variable -from fedscale.cloud.execution.client import Client +from fedscale.cloud.execution.torch_client import TorchClient -class Customized_Client(Client): +class Customized_Client(TorchClient): """ Basic client component in Federated Learning Local differential privacy @@ -19,8 +19,8 @@ class Customized_Client(Client): def train(self, client_data, model, conf): - clientId = conf.clientId - logging.info(f"Start to train (CLIENT: {clientId}) ...") + client_id = conf.client_id + logging.info(f"Start to train (CLIENT: {client_id}) ...") tokenizer, device = conf.tokenizer, conf.device last_model_params = [p.data.clone() for p in model.parameters()] @@ -65,15 +65,15 @@ def train(self, client_data, model, conf): torch.normal(mean=0, std=sigma, size=state_dicts[p].data.shape).cpu().numpy()) for p in state_dicts} - results = {'clientId': clientId, 'moving_loss': self.epoch_train_loss, + results = {'client_id': client_id, 'moving_loss': self.epoch_train_loss, 'trained_size': self.completed_steps*conf.batch_size, 'success': self.completed_steps > 0} results['utility'] = math.sqrt( - self.loss_squre)*float(trained_unique_samples) + self.loss_squared)*float(trained_unique_samples) if error_type is None: - logging.info(f"Training of (CLIENT: {clientId}) completes, {results}") + logging.info(f"Training of (CLIENT: {client_id}) completes, {results}") else: - logging.info(f"Training of (CLIENT: {clientId}) failed as {error_type}") + logging.info(f"Training of (CLIENT: {client_id}) failed as {error_type}") results['update_weight'] = model_param results['wall_duration'] = 0 diff --git a/examples/differential_privacy/customized_executor.py b/examples/differential_privacy/customized_executor.py index 94966bb0..e113ca79 100644 --- a/examples/differential_privacy/customized_executor.py +++ b/examples/differential_privacy/customized_executor.py @@ -8,7 +8,7 @@ from fedscale.cloud.execution.executor import Executor import fedscale.cloud.config_parser as parser -"""In this example, we only need to change the Client Component we need to import""" +"""In this example, we only need to change the TorchClient Component we need to import""" class Customized_Executor(Executor): 
"""Each executor takes certain resource to run real training. diff --git a/examples/dry_run/customized_client.py b/examples/dry_run/customized_client.py index cde31a10..4ead6fa6 100644 --- a/examples/dry_run/customized_client.py +++ b/examples/dry_run/customized_client.py @@ -7,17 +7,17 @@ import torch from torch.autograd import Variable -from fedscale.cloud.execution.client import Client +from fedscale.cloud.execution.torch_client import TorchClient -class Customized_Client(Client): +class Customized_Client(TorchClient): """Basic client component in Federated Learning""" def train(self, client_data, model, conf): """We flip the label of the malicious client""" - clientId = conf.clientId + client_id = conf.client_id - logging.info(f"Start to train (CLIENT: {clientId}) ...") + logging.info(f"Start to train (CLIENT: {client_id}) ...") device = conf.device model = model.to(device=device) @@ -62,14 +62,14 @@ def train(self, client_data, model, conf): state_dicts = model.state_dict() model_param = {p:state_dicts[p].data.cpu().numpy() for p in state_dicts} - results = {'clientId':clientId, 'moving_loss': epoch_train_loss, + results = {'client_id':client_id, 'moving_loss': epoch_train_loss, 'trained_size': completed_steps*conf.batch_size, 'success': completed_steps > 0} results['utility'] = math.sqrt(epoch_train_loss)*float(trained_unique_samples) if error_type is None: - logging.info(f"Training of (CLIENT: {clientId}) completes, {results}") + logging.info(f"Training of (CLIENT: {client_id}) completes, {results}") else: - logging.info(f"Training of (CLIENT: {clientId}) failed as {error_type}") + logging.info(f"Training of (CLIENT: {client_id}) failed as {error_type}") results['update_weight'] = model_param results['wall_duration'] = 0 diff --git a/examples/dry_run/customized_executor.py b/examples/dry_run/customized_executor.py index 2741a0fb..bd936810 100644 --- a/examples/dry_run/customized_executor.py +++ b/examples/dry_run/customized_executor.py @@ -8,7 +8,7 @@ from fedscale.cloud.execution.executor import Executor import fedscale.cloud.config_parser as parser -"""In this example, we only need to change the Client Component we need to import""" +"""In this example, we only need to change the TorchClient Component we need to import""" class Customized_Executor(Executor): """Each executor takes certain resource to run real training. 
diff --git a/examples/heterofl/customized_aggregator.py b/examples/heterofl/customized_aggregator.py index ff493a1c..0747f8b4 100644 --- a/examples/heterofl/customized_aggregator.py +++ b/examples/heterofl/customized_aggregator.py @@ -1,13 +1,14 @@ -import os -import sys +import math +import random +from collections import OrderedDict + +import torch import config import customized_fllibs from customized_fllibs import make_param_idx - -import fedscale.cloud.config_parser as parser from fedscale.cloud.aggregation.aggregator import Aggregator -from fedscale.cloud.logger.aggragation import * +from fedscale.cloud.logger.aggregation_logging import * class Customized_Aggregator(Aggregator): @@ -55,14 +56,14 @@ def client_completion_handler(self, results): self.client_training_results.append(results) self.stats_util_accumulator.append(results['utility']) self.loss_accumulator.append(results['moving_loss']) - self.client_manager.registerScore(results['clientId'], results['utility'], auxi=math.sqrt(results['moving_loss']), + self.client_manager.registerScore(results['client_id'], results['utility'], auxi=math.sqrt(results['moving_loss']), time_stamp=self.epoch, - duration=self.virtual_client_clock[results['clientId']]['computation']+self.virtual_client_clock[results['clientId']]['communication'] + duration=self.virtual_client_clock[results['client_id']]['computation']+self.virtual_client_clock[results['client_id']]['communication'] ) self.update_lock.acquire() self.model_in_update += 1 - + if self.model_in_update == self.tasks_round: self.combine_models() @@ -115,9 +116,9 @@ def combine_models(self): count[k] += 1 tmp_v[count[k] > 0] = tmp_v[count[k] > 0].div_(count[k][count[k] > 0]) v[count[k] > 0] = tmp_v[count[k] > 0].to(v.dtype) - return - - + return + + if __name__ == "__main__": aggregator = Customized_Aggregator(parser.args) aggregator.run() \ No newline at end of file diff --git a/examples/heterofl/customized_client.py b/examples/heterofl/customized_client.py index 9f5a58a1..14e4ebf0 100644 --- a/examples/heterofl/customized_client.py +++ b/examples/heterofl/customized_client.py @@ -5,17 +5,17 @@ from customized_fllibs import split_model from resnet_heterofl import resnet18 -from fedscale.cloud.execution.client import Client +from fedscale.cloud.execution.torch_client import TorchClient from fedscale.cloud.fllibs import Variable, logging, math, np, os, torch -class Customized_Client(Client): +class Customized_Client(TorchClient): def __init__(self, conf): super().__init__(conf) self.model_rate = None self.param_idx = None self.local_parameters = None - + def make_model_rate(self): """get the model scaling rate""" @@ -23,7 +23,7 @@ def make_model_rate(self): self.model_rate = np.random.choice(config.cfg['shrinkage']) elif config.cfg['model_split_mode'] == 'fix': for i in range(len(config.cfg['model_rate'])): - if self.clientId % sum(config.cfg['proportion_of_model']) < \ + if self.client_id % sum(config.cfg['proportion_of_model']) < \ sum(config.cfg['proportion_of_model'][:i+1]): self.model_rate = config.cfg['model_rate'][i] break @@ -31,13 +31,13 @@ def make_model_rate(self): def train(self, client_data, model, conf): - self.clientId = conf.clientId + self.client_id = conf.client_id self.make_model_rate() - logging.info(f"Start to split model (CLIENT: {self.clientId}, MODEL RATE: {self.model_rate}) ...") + logging.info(f"Start to split model (CLIENT: {self.client_id}, MODEL RATE: {self.model_rate}) ...") self.local_parameters = split_model(model, self.model_rate) self.local_model = 
resnet18(model_rate=self.model_rate) self.local_model.load_state_dict(self.local_parameters) - logging.info(f"Start to train (CLIENT: {self.clientId}) ...") + logging.info(f"Start to train (CLIENT: {self.client_id}) ...") device = conf.device self.local_model = self.local_model.to(device=device) self.local_model.train(True) @@ -47,7 +47,7 @@ def train(self, client_data, model, conf): epoch_train_loss = 1e-4 error_type = None completed_steps = 0 - loss_squre = 0 + loss_squared = 0 completed_steps = 0 while completed_steps < config.cfg['local_epochs']: try: @@ -63,7 +63,7 @@ def train(self, client_data, model, conf): loss_list = loss.tolist() loss = loss.mean() temp_loss = sum(loss_list)/float(len(loss_list)) - loss_squre = sum([l**2 for l in loss_list])/float(len(loss_list)) + loss_squared = sum([l**2 for l in loss_list])/float(len(loss_list)) if completed_steps < len(client_data): if epoch_train_loss == 1e-4: epoch_train_loss = temp_loss @@ -73,20 +73,20 @@ def train(self, client_data, model, conf): loss.backward() torch.nn.utils.clip_grad_norm_(self.local_model.parameters(), 1) optimizer.step() - logging.info(f"Client {self.clientId} completes local epoch: {completed_steps}, loss square: {loss_squre}") + logging.info(f"Client {self.client_id} completes local epoch: {completed_steps}, loss square: {loss_squared}") completed_steps += 1 except Exception as ex: error_type = ex break - results = {'clientId':self.clientId, 'moving_loss': epoch_train_loss, + results = {'client_id':self.client_id, 'moving_loss': epoch_train_loss, 'trained_size': completed_steps*conf.batch_size, 'success': completed_steps > 0} - results['utility'] = math.sqrt(loss_squre)*float(trained_unique_samples) + results['utility'] = math.sqrt(loss_squared)*float(trained_unique_samples) if error_type is None: - logging.info(f"Training of (CLIENT: {self.clientId}) completes, {results}") + logging.info(f"Training of (CLIENT: {self.client_id}) completes, {results}") else: - logging.info(f"Training of (CLIENT: {self.clientId}) failed as {error_type}") + logging.info(f"Training of (CLIENT: {self.client_id}) failed as {error_type}") results['wall_duration'] = 0 results['model_rate'] = self.model_rate diff --git a/examples/poisoning_setting/customized_client.py b/examples/poisoning_setting/customized_client.py index 032a3936..655c6abe 100644 --- a/examples/poisoning_setting/customized_client.py +++ b/examples/poisoning_setting/customized_client.py @@ -8,25 +8,25 @@ from clip_norm import clip_grad_norm_ from torch.autograd import Variable -from fedscale.cloud.execution.client import Client +from fedscale.cloud.execution.torch_client import TorchClient -class Customized_Client(Client): +class Customized_Client(TorchClient): """Basic client component in Federated Learning""" def train(self, client_data, model, conf): """We flip the label of the malicious client""" - clientId = conf.clientId + client_id = conf.client_id """1 out of malicious_factor client is malicious""" - is_malicious = ((clientId+1) % conf.malicious_factor == 0) + is_malicious = ((client_id+1) % conf.malicious_factor == 0) if is_malicious: label_mapping = list(range(conf.num_class)) - np.random.seed(clientId) + np.random.seed(client_id) np.random.shuffle(label_mapping) - logging.info(f"Start to train (CLIENT: {clientId}) ...") + logging.info(f"Start to train (CLIENT: {client_id}) ...") device = conf.device last_model_params = [p.data.clone() for p in model.parameters()] @@ -96,14 +96,14 @@ def train(self, client_data, model, conf): state_dicts = model.state_dict() 
model_param = {p:state_dicts[p].data.cpu().numpy() for p in state_dicts} - results = {'clientId':clientId, 'moving_loss': epoch_train_loss, + results = {'client_id':client_id, 'moving_loss': epoch_train_loss, 'trained_size': completed_steps*conf.batch_size, 'success': completed_steps > 0} results['utility'] = math.sqrt(epoch_train_loss)*float(trained_unique_samples) if error_type is None: - logging.info(f"Training of (CLIENT: {clientId}) completes, {results}, is_malicious: {is_malicious}") + logging.info(f"Training of (CLIENT: {client_id}) completes, {results}, is_malicious: {is_malicious}") else: - logging.info(f"Training of (CLIENT: {clientId}) failed as {error_type}") + logging.info(f"Training of (CLIENT: {client_id}) failed as {error_type}") results['update_weight'] = model_param results['wall_duration'] = 0 diff --git a/examples/poisoning_setting/customized_executor.py b/examples/poisoning_setting/customized_executor.py index 15593403..800b1d0b 100644 --- a/examples/poisoning_setting/customized_executor.py +++ b/examples/poisoning_setting/customized_executor.py @@ -9,7 +9,7 @@ from fedscale.cloud.execution.executor import Executor -"""In this example, we only need to change the Client Component we need to import""" +"""In this example, we only need to change the TorchClient Component we need to import""" class Customized_Executor(Executor): """Each executor takes certain resource to run real training. diff --git a/examples/tensorflow_engine/tf_aggregator.py b/examples/tensorflow_engine/tf_aggregator.py deleted file mode 100644 index f3a2b64a..00000000 --- a/examples/tensorflow_engine/tf_aggregator.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -import sys - -import tensorflow as tf - -import fedscale.cloud.config_parser as parser -from fedscale.cloud.aggregation.aggregator import Aggregator -from fedscale.cloud.logger.aggragation import * - - -class Customized_Aggregator(Aggregator): - """Feed aggregator using tensorflow models""" - def __init__(self, args): - super().__init__(args) - - def init_model(self): - """Load model""" - # CIFAR-10 as example - self.model = tf.keras.applications.resnet.ResNet50( - include_top=True, - weights=None, - input_tensor=None, - input_shape=[32, 32, 3], - pooling=None, - classes=10 - ) - # Initiate model parameters dictionary - self.model_weights = { - layer.name:[torch.from_numpy(p) for p in layer.get_weights()] for layer in self.model.layers - } - -if __name__ == "__main__": - aggregator = Customized_Aggregator(parser.args) - aggregator.run() \ No newline at end of file diff --git a/examples/tensorflow_engine/tf_client.py b/examples/tensorflow_engine/tf_client.py deleted file mode 100644 index 2556d8fb..00000000 --- a/examples/tensorflow_engine/tf_client.py +++ /dev/null @@ -1,62 +0,0 @@ -import logging -import math -import os -import sys - -import numpy as np -import tensorflow as tf -import torch - -from fedscale.cloud.execution.client import Client - - -class Customized_Client(Client): - """Inherit default client to use tensorflow engine""" - def __init__(self, conf): - pass - - def train(self, client_data, model, conf): - - clientId = conf.clientId - logging.info(f"Start to train (CLIENT: {clientId}) ...") - train_len = len(client_data) - - def gen(): - while True: - for x, y in client_data: - # Convert torch tensor to tf tensor - nx, ny = tf.convert_to_tensor(x.swapaxes(1, 3).numpy()), tf.convert_to_tensor(y.numpy()) - yield nx, ny - - # Sample a batch to get tensor properties - temp_x, temp_y = next(gen()) - - tf_client_data = 
tf.data.Dataset.from_generator( - gen, - output_types=(temp_x.dtype, temp_y.dtype), - output_shapes=(temp_x.shape, temp_y.shape) - ) - - optimizer = tf.keras.optimizers.SGD(learning_rate=conf.learning_rate, momentum=0.9, - nesterov=False, name='SGD') - model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', - metrics=['accuracy']) - - history = model.fit(tf_client_data, epochs=1, steps_per_epoch=conf.local_steps, verbose=0) - - # Report the training results - results = {'clientId': clientId, - 'moving_loss': sum(history.history['loss'])/(len(history.history['loss'])+1e-4), - 'trained_size': conf.local_steps*train_len, 'success': True, 'utility': 1} - - logging.info(f"Training of (CLIENT: {clientId}) completes, {results}") - - - results['update_weight'] = {layer.name:layer.get_weights() for layer in model.layers} - results['wall_duration'] = 0 - - return results - - - def test(self, conf): - pass diff --git a/examples/tensorflow_engine/tf_executor.py b/examples/tensorflow_engine/tf_executor.py deleted file mode 100644 index 077b880d..00000000 --- a/examples/tensorflow_engine/tf_executor.py +++ /dev/null @@ -1,41 +0,0 @@ -# -*- coding: utf-8 -*- - -import os -import sys - -import tensorflow as tf -from tf_client import Customized_Client - -import fedscale.cloud.config_parser as parser -from fedscale.cloud.execution.executor import Executor -from fedscale.cloud.logger.execution import * - -"""In this example, we only need to change the Client Component we need to import""" - -class Customized_Executor(Executor): - """Each executor takes certain resource to run real training. - Each run simulates the execution of an individual client""" - - def __init__(self, args): - super(Customized_Executor, self).__init__(args) - - def get_client_trainer(self, conf): - return Customized_Client(conf) - - def init_model(self): - """Return the model architecture used in training""" - model = tf.keras.applications.resnet.ResNet50( - include_top=True, - weights=None, - input_tensor=None, - input_shape=[32, 32, 3], - pooling=None, - classes=10 - ) - return model - - -if __name__ == "__main__": - executor = Customized_Executor(parser.args) - executor.run() - diff --git a/fedscale/cloud/aggregation/aggregator.py b/fedscale/cloud/aggregation/aggregator.py index 0534b530..635872ed 100755 --- a/fedscale/cloud/aggregation/aggregator.py +++ b/fedscale/cloud/aggregation/aggregator.py @@ -1,22 +1,29 @@ # -*- coding: utf-8 -*- - +import collections +import copy +import math import pickle +import random import threading +import time from concurrent import futures import grpc +import numpy as np import torch -from torch.utils.tensorboard import SummaryWriter import fedscale.cloud.channels.job_api_pb2_grpc as job_api_pb2_grpc -import fedscale.cloud.logger.aggragation as logger -import fedscale.cloud.config_parser as parser -from fedscale.cloud import commons +import fedscale.cloud.logger.aggregator_logging as logger +from fedscale.cloud.aggregation.optimizers import TorchServerOptimizer from fedscale.cloud.channels import job_api_pb2 +from fedscale.cloud.client_manager import ClientManager +from fedscale.cloud.internal.tensorflow_model_adapter import TensorflowModelAdapter +from fedscale.cloud.internal.torch_model_adapter import TorchModelAdapter from fedscale.cloud.resource_manager import ResourceManager from fedscale.cloud.fllibs import * +from torch.utils.tensorboard import SummaryWriter -MAX_MESSAGE_LENGTH = 1*1024*1024*1024 # 1GB +MAX_MESSAGE_LENGTH = 1 * 1024 * 1024 * 1024 # 1GB class 
Aggregator(job_api_pb2_grpc.JobServiceServicer): @@ -26,12 +33,12 @@ class Aggregator(job_api_pb2_grpc.JobServiceServicer): args (dictionary): Variable arguments for fedscale runtime config. defaults to the setup in arg_parser.py """ + def __init__(self, args): # init aggregator loger logger.initiate_aggregator_setting() logging.info(f"Job args {args}") - self.args = args self.experiment_mode = args.experiment_mode self.device = args.cuda_device if args.use_cuda else torch.device( @@ -45,16 +52,11 @@ def __init__(self, args): self.client_manager = self.init_client_manager(args=args) # ======== model and data ======== - self.model = None + self.model_wrapper = None self.model_in_update = 0 self.update_lock = threading.Lock() # all weights including bias/#_batch_tracked (e.g., state_dict) - self.model_weights = collections.OrderedDict() - self.last_gradient_weights = [] # only gradient variables - self.model_state_dict = None - # NOTE: if (e.g., model.parameters() in PyTorch), then False - # True, if (e.g., layer.get_weights() in Tensorflow) - self.using_group_params = self.args.engine == commons.TENSORFLOW + self.model_weights = None # ======== channels ======== self.connection_timeout = self.args.connection_timeout @@ -62,7 +64,7 @@ def __init__(self, args): self.grpc_server = None # ======== Event Queue ======= - self.individual_client_events = {} # Unicast + self.individual_client_events = {} # Unicast self.sever_events_queue = collections.deque() self.broadcast_events_queue = collections.deque() # Broadcast @@ -81,7 +83,6 @@ def __init__(self, args): self.model_update_size = 0. self.collate_fn = None - self.task = args.task self.round = 0 self.start_run_time = time.time() @@ -95,7 +96,8 @@ def __init__(self, args): self.registered_executor_info = set() self.test_result_accumulator = [] self.testing_history = {'data_set': args.data_set, 'model': args.model, 'sample_mode': args.sample_mode, - 'gradient_policy': args.gradient_policy, 'task': args.task, 'perf': collections.OrderedDict()} + 'gradient_policy': args.gradient_policy, 'task': args.task, + 'perf': collections.OrderedDict()} self.log_writer = SummaryWriter(log_dir=logger.logDir) # ======== Task specific ============ @@ -105,8 +107,6 @@ def setup_env(self): """Set up experiments environment and server optimizer """ self.setup_seed(seed=1) - self.optimizer = ServerOptimizer( - self.args.gradient_policy, self.args, self.device) def setup_seed(self, seed=1): """Set global random seed for better reproducibility @@ -160,14 +160,17 @@ def init_data_communication(self): pass def init_model(self): - """Load the model architecture - """ - assert self.args.engine == commons.PYTORCH, "Please define model for non-PyTorch models" - - self.model = init_model() - - # Initiate model parameters dictionary - self.model_weights = self.model.state_dict() + """Initialize the model""" + if self.args.engine == commons.TENSORFLOW: + self.model_wrapper = TensorflowModelAdapter(init_model()) + elif self.args.engine == commons.PYTORCH: + self.model_wrapper = TorchModelAdapter( + init_model(), + optimizer=TorchServerOptimizer( + self.args.gradient_policy, self.args, self.device)) + else: + raise ValueError(f"{self.args.engine} is not a supported engine.") + self.model_weights = self.model_wrapper.get_weights() def init_task_context(self): """Initiate execution context for specific tasks @@ -217,7 +220,7 @@ def load_client_profile(self, file_path): global_client_profile = {} if os.path.exists(file_path): with open(file_path, 'rb') as fin: - # {clientId: 
[computer, bandwidth]} + # {client_id: [computer, bandwidth]} global_client_profile = pickle.load(fin) return global_client_profile @@ -233,17 +236,17 @@ def client_register_handler(self, executorId, info): logging.info(f"Loading {len(info['size'])} client traces ...") for _size in info['size']: # since the worker rankId starts from 1, we also configure the initial dataId as 1 - mapped_id = (self.num_of_clients+1) % len( + mapped_id = (self.num_of_clients + 1) % len( self.client_profiles) if len(self.client_profiles) > 0 else 1 systemProfile = self.client_profiles.get( mapped_id, {'computation': 1.0, 'communication': 1.0}) - clientId = ( - self.num_of_clients+1) if self.experiment_mode == commons.SIMULATION_MODE else executorId + client_id = ( + self.num_of_clients + 1) if self.experiment_mode == commons.SIMULATION_MODE else executorId self.client_manager.register_client( - executorId, clientId, size=_size, speed=systemProfile) + executorId, client_id, size=_size, speed=systemProfile) self.client_manager.registerDuration( - clientId, + client_id, batch_size=self.args.batch_size, local_steps=self.args.local_steps, upload_size=self.model_update_size, @@ -264,7 +267,8 @@ def executor_info_handler(self, executorId, info): """ self.registered_executor_info.add(executorId) - logging.info(f"Received executor {executorId} information, {len(self.registered_executor_info)}/{len(self.executors)}") + logging.info( + f"Received executor {executorId} information, {len(self.registered_executor_info)}/{len(self.executors)}") # In this simulation, we run data split on each worker, so collecting info from one executor is enough # Waiting for data information from executors, or timeout @@ -303,11 +307,13 @@ def tictak_client_tasks(self, sampled_clients, num_clients_to_collect): client_cfg = self.client_conf.get(client_to_run, self.args) exe_cost = self.client_manager.get_completion_time(client_to_run, - batch_size=client_cfg.batch_size, local_steps=client_cfg.local_steps, - upload_size=self.model_update_size, download_size=self.model_update_size) + batch_size=client_cfg.batch_size, + local_steps=client_cfg.local_steps, + upload_size=self.model_update_size, + download_size=self.model_update_size) roundDuration = exe_cost['computation'] + \ - exe_cost['communication'] + exe_cost['communication'] # if the client is not active by the time of collection, we consider it is lost in this round if self.client_manager.isClientActive(client_to_run, roundDuration + self.global_virtual_clock): sampledClientsReal.append(client_to_run) @@ -317,13 +323,13 @@ def tictak_client_tasks(self, sampled_clients, num_clients_to_collect): num_clients_to_collect = min( num_clients_to_collect, len(completionTimes)) # 2. get the top-k completions to remove stragglers - sortedWorkersByCompletion = sorted( + workers_sorted_by_completion_time = sorted( range(len(completionTimes)), key=lambda k: completionTimes[k]) - top_k_index = sortedWorkersByCompletion[:num_clients_to_collect] + top_k_index = workers_sorted_by_completion_time[:num_clients_to_collect] clients_to_run = [sampledClientsReal[k] for k in top_k_index] dummy_clients = [sampledClientsReal[k] - for k in sortedWorkersByCompletion[num_clients_to_collect:]] + for k in workers_sorted_by_completion_time[num_clients_to_collect:]] round_duration = completionTimes[top_k_index[-1]] completionTimes.sort() @@ -346,9 +352,8 @@ def run(self): self.init_data_communication() self.init_model() - self.save_last_param() self.model_update_size = sys.getsizeof( - pickle.dumps(self.model))/1024.0*8. 
# kbits + pickle.dumps(self.model_wrapper)) / 1024.0 * 8. # kbits self.client_profiles = self.load_client_profile( file_path=self.args.device_conf_file) @@ -366,7 +371,7 @@ def select_participants(self, select_num_participants, overcommitment=1.3): """ return sorted(self.client_manager.select_participants( - int(select_num_participants*overcommitment), + int(select_num_participants * overcommitment), cur_time=self.global_virtual_clock), ) @@ -379,7 +384,7 @@ def client_completion_handler(self, results): """ # Format: - # -results = {'clientId':clientId, 'update_weight': model_param, 'moving_loss': round_train_loss, + # -results = {'client_id':client_id, 'update_weight': model_param, 'moving_loss': round_train_loss, # 'trained_size': count, 'wall_duration': time_cost, 'success': is_success 'utility': utility} if self.args.gradient_policy in ['q-fedavg']: @@ -388,124 +393,67 @@ def client_completion_handler(self, results): self.stats_util_accumulator.append(results['utility']) self.loss_accumulator.append(results['moving_loss']) - self.client_manager.register_feedback(results['clientId'], results['utility'], - auxi=math.sqrt( - results['moving_loss']), - time_stamp=self.round, - duration=self.virtual_client_clock[results['clientId']]['computation'] + - self.virtual_client_clock[results['clientId']]['communication'] - ) + self.client_manager.register_feedback(results['client_id'], results['utility'], + auxi=math.sqrt( + results['moving_loss']), + time_stamp=self.round, + duration=self.virtual_client_clock[results['client_id']]['computation'] + + self.virtual_client_clock[results['client_id']]['communication'] + ) # ================== Aggregate weights ====================== self.update_lock.acquire() self.model_in_update += 1 - if self.using_group_params == True: - self.aggregate_client_group_weights(results) - else: - self.aggregate_client_weights(results) + self.update_weight_aggregation(results['update_weight']) self.update_lock.release() - def aggregate_client_weights(self, results): - """May aggregate client updates on the fly - - Args: - results (dictionary): client's training result - - [FedAvg] "Communication-Efficient Learning of Deep Networks from Decentralized Data". - H. Brendan McMahan, Eider Moore, Daniel Ramage, Seth Hampson, Blaise Aguera y Arcas. AISTATS, 2017 - """ - # Start to take the average of updates, and we do not keep updates to save memory - # Importance of each update is 1/#_of_participants - # importance = 1./self.tasks_round - - for p in results['update_weight']: - param_weight = results['update_weight'][p] - if isinstance(param_weight, list): - param_weight = np.asarray(param_weight, dtype=np.float32) - param_weight = torch.from_numpy( - param_weight).to(device=self.device) - - if self.model_in_update == 1: - self.model_weights[p].data = param_weight - else: - self.model_weights[p].data += param_weight - - if self.model_in_update == self.tasks_round: - for p in self.model_weights: - d_type = self.model_weights[p].data.dtype - - self.model_weights[p].data = ( - self.model_weights[p]/float(self.tasks_round)).to(dtype=d_type) - - def aggregate_client_group_weights(self, results): - """Streaming weight aggregation. 
Similar to aggregate_client_weights, - but each key corresponds to a group of weights (e.g., for Tensorflow) - - Args: - results (dictionary): Client's training result - - """ - for p_g in results['update_weight']: - param_weights = results['update_weight'][p_g] - for idx, param_weight in enumerate(param_weights): - if isinstance(param_weight, list): - param_weight = np.asarray(param_weight, dtype=np.float32) - param_weight = torch.from_numpy( - param_weight).to(device=self.device) - - if self.model_in_update == 1: - self.model_weights[p_g][idx].data = param_weight - else: - self.model_weights[p_g][idx].data += param_weight - - if self.model_in_update == self.tasks_round: - for p in self.model_weights: - for idx in range(len(self.model_weights[p])): - d_type = self.model_weights[p][idx].data.dtype - - self.model_weights[p][idx].data = ( - self.model_weights[p][idx].data/float(self.tasks_round) - ).to(dtype=d_type) - - def save_last_param(self): - """ Save the last model parameters - """ - if self.args.engine == commons.TENSORFLOW: - self.last_gradient_weights = [ - layer.get_weights() for layer in self.model.layers] - self.model_weights = copy.deepcopy(self.model.state_dict()) + def update_weight_aggregation(self, update_weights): + if type(update_weights) is dict: + update_weights = [x for x in update_weights.values()] + if self.model_in_update == 1: + self.model_weights = update_weights else: - self.last_gradient_weights = [ - p.data.clone() for p in self.model.parameters()] - self.model_weights = copy.deepcopy(self.model.state_dict()) + self.model_weights = [weight + update_weights[i] for i, weight in enumerate(self.model_weights)] + if self.model_in_update == self.tasks_round: + self.model_weights = [np.divide(weight, self.tasks_round) for weight in self.model_weights] + self.model_wrapper.set_weights(copy.deepcopy(self.model_weights)) + + def aggregate_test_result(self): + accumulator = self.test_result_accumulator[0] + for i in range(1, len(self.test_result_accumulator)): + if self.args.task == "detection": + for key in accumulator: + if key == "boxes": + for j in range(596): + accumulator[key][j] = accumulator[key][j] + \ + self.test_result_accumulator[i][key][j] + else: + accumulator[key] += self.test_result_accumulator[i][key] + else: + for key in accumulator: + accumulator[key] += self.test_result_accumulator[i][key] + self.testing_history['perf'][self.round] = {'round': self.round, 'clock': self.global_virtual_clock} + for metric_name in accumulator.keys(): + if metric_name == 'test_loss': + self.testing_history['perf'][self.round]['loss'] = accumulator['test_loss'] \ + if self.args.task == "detection" else accumulator['test_loss'] / accumulator['test_len'] + elif metric_name not in ['test_len']: + self.testing_history['perf'][self.round][metric_name] \ + = accumulator[metric_name] / accumulator['test_len'] + + round_perf = self.testing_history['perf'][self.round] + logging.info( + "FL Testing in round: {}, virtual_clock: {}, results: {}" + .format(self.round, self.global_virtual_clock, round_perf)) def update_default_task_config(self): """Update the default task configuration after each round """ if self.round % self.args.decay_round == 0: self.args.learning_rate = max( - self.args.learning_rate*self.args.decay_factor, self.args.min_learning_rate) - - def round_weight_handler(self, last_model): - """Update model when the round completes - - Args: - last_model (list): A list of global model weight in last round. 
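The new `update_weight_aggregation` above replaces both per-parameter paths with a single running sum over incoming updates, dividing once the last update of the round arrives. A self-contained sketch of that incremental FedAvg, assuming each client update arrives as a list of numpy arrays (the convention the model adapters use):

```python
import numpy as np

# Standalone sketch of the running-average aggregation performed by
# update_weight_aggregation; `updates` stands in for the per-client
# 'update_weight' payloads (one np.ndarray per model tensor).
def fed_avg(updates):
    aggregate = [np.zeros_like(w) for w in updates[0]]
    for client_weights in updates:        # arrives one client at a time
        for i, w in enumerate(client_weights):
            aggregate[i] += w             # running sum, no history kept
    return [w / len(updates) for w in aggregate]

clients = [[np.ones((2, 2)), np.zeros(3)],
           [3 * np.ones((2, 2)), np.ones(3)]]
avg = fed_avg(clients)  # -> [2s in a 2x2 array, then [0.5, 0.5, 0.5]]
```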
- - """ - if self.round > 1: - if self.args.engine == commons.TENSORFLOW: - for layer in self.model.layers: - layer.set_weights([p.cpu().detach().numpy() - for p in self.model_weights[layer.name]]) - else: - self.model.load_state_dict(self.model_weights) - current_grad_weights = [param.data.clone() - for param in self.model.parameters()] - self.optimizer.update_round_gradient( - last_model, current_grad_weights, self.model) + self.args.learning_rate * self.args.decay_factor, self.args.min_learning_rate) def round_completion_handler(self): """Triggered upon the round completion, it registers the last round execution info, @@ -513,22 +461,18 @@ def round_completion_handler(self): """ self.global_virtual_clock += self.round_duration self.round += 1 - - # handle the global update w/ current and last - self.round_weight_handler(self.last_gradient_weights) - - avgUtilLastround = sum(self.stats_util_accumulator) / \ - max(1, len(self.stats_util_accumulator)) + last_round_avg_util = sum(self.stats_util_accumulator) / \ + max(1, len(self.stats_util_accumulator)) # assign avg reward to explored, but not ran workers - for clientId in self.round_stragglers: - self.client_manager.register_feedback(clientId, avgUtilLastround, - time_stamp=self.round, - duration=self.virtual_client_clock[clientId]['computation'] + - self.virtual_client_clock[clientId]['communication'], - success=False) + for client_id in self.round_stragglers: + self.client_manager.register_feedback(client_id, last_round_avg_util, + time_stamp=self.round, + duration=self.virtual_client_clock[client_id]['computation'] + + self.virtual_client_clock[client_id]['communication'], + success=False) avg_loss = sum(self.loss_accumulator) / \ - max(1, len(self.loss_accumulator)) + max(1, len(self.loss_accumulator)) logging.info(f"Wall clock: {round(self.global_virtual_clock)} s, round: {self.round}, Planned participants: " + f"{len(self.sampled_participants)}, Succeed participants: {len(self.stats_util_accumulator)}, Training loss: {avg_loss}") @@ -539,7 +483,8 @@ def round_completion_handler(self): # update select participants self.sampled_participants = self.select_participants( select_num_participants=self.args.num_participants, overcommitment=self.args.overcommitment) - (clientsToRun, round_stragglers, virtual_client_clock, round_duration, flatten_client_duration) = self.tictak_client_tasks( + (clientsToRun, round_stragglers, virtual_client_clock, round_duration, + flatten_client_duration) = self.tictak_client_tasks( self.sampled_participants, self.args.num_participants) logging.info(f"Selected participants to run: {clientsToRun}") @@ -555,11 +500,9 @@ def round_completion_handler(self): else: self.sampled_executors = [str(c_id) for c_id in self.sampled_participants] - - self.save_last_param() self.round_stragglers = round_stragglers self.virtual_client_clock = virtual_client_clock - self.flatten_client_duration = numpy.array(flatten_client_duration) + self.flatten_client_duration = np.array(flatten_client_duration) self.round_duration = round_duration self.model_in_update = 0 self.test_result_accumulator = [] @@ -570,7 +513,7 @@ def round_completion_handler(self): if self.round >= self.args.rounds: self.broadcast_aggregator_events(commons.SHUT_DOWN) - elif self.round % self.args.eval_interval == 0: + elif self.round % self.args.eval_interval == 0 or self.round == 1: self.broadcast_aggregator_events(commons.UPDATE_MODEL) self.broadcast_aggregator_events(commons.MODEL_TEST) else: @@ -582,9 +525,9 @@ def log_train_result(self, avg_loss): """ 
self.log_writer.add_scalar('Train/round_to_loss', avg_loss, self.round) self.log_writer.add_scalar( - 'FAR/time_to_train_loss (min)', avg_loss, self.global_virtual_clock/60.) + 'FAR/time_to_train_loss (min)', avg_loss, self.global_virtual_clock / 60.) self.log_writer.add_scalar( - 'FAR/round_duration (min)', self.round_duration/60., self.round) + 'FAR/round_duration (min)', self.round_duration / 60., self.round) self.log_writer.add_histogram( 'FAR/client_duration (min)', self.flatten_client_duration, self.round) @@ -596,9 +539,9 @@ def log_test_result(self): self.log_writer.add_scalar( 'Test/round_to_accuracy', self.testing_history['perf'][self.round]['top_1'], self.round) self.log_writer.add_scalar('FAR/time_to_test_loss (min)', self.testing_history['perf'][self.round]['loss'], - self.global_virtual_clock/60.) + self.global_virtual_clock / 60.) self.log_writer.add_scalar('FAR/time_to_test_accuracy (min)', self.testing_history['perf'][self.round]['top_1'], - self.global_virtual_clock/60.) + self.global_virtual_clock / 60.) def deserialize_response(self, responses): """Deserialize the response from executor @@ -641,9 +584,7 @@ def testing_completion_handler(self, client_id, results): if len(self.test_result_accumulator) == len(self.executors): - logger.aggregate_test_result( - self.test_result_accumulator, self.args.task, \ - self.round, self.global_virtual_clock, self.testing_history) + self.aggregate_test_result() # Dump the testing result with open(os.path.join(logger.logDir, 'testing_perf'), 'wb') as fout: pickle.dump(self.testing_history, fout) @@ -677,15 +618,15 @@ def dispatch_client_events(self, event, clients=None): for client_id in clients: self.individual_client_events[client_id].append(event) - def get_client_conf(self, clientId): + def get_client_conf(self, client_id): """Training configurations that will be applied on clients, developers can further define personalized client config here. Args: - clientId (int): The client id. + client_id (int): The client id. Returns: - dictionary: Client training config. + dictionary: TorchClient training config. """ conf = { @@ -703,14 +644,13 @@ def create_client_task(self, executorId): tuple: Training config for new task. (dictionary, PyTorch or TensorFlow module) """ - next_clientId = self.resource_manager.get_next_task(executorId) + next_client_id = self.resource_manager.get_next_task(executorId) train_config = None # NOTE: model = None then the executor will load the global model broadcasted in UPDATE_MODEL - model = None - if next_clientId != None: - config = self.get_client_conf(next_clientId) - train_config = {'client_id': next_clientId, 'task_config': config} - return train_config, model + if next_client_id != None: + config = self.get_client_conf(next_client_id) + train_config = {'client_id': next_client_id, 'task_config': config} + return train_config, self.model_wrapper.get_weights() def get_test_config(self, client_id): """FL model testing on clients, developers can further define personalized client config here. @@ -724,20 +664,11 @@ def get_test_config(self, client_id): """ return {'client_id': client_id} - def get_global_model(self): - """Get global model that would be used by all FL clients (in default FL) - - Returns: - PyTorch or TensorFlow module: Based on the executor's machine learning framework, initialize and return the model for training. - - """ - return self.model - def get_shutdown_config(self, client_id): """Shutdown config for client, developers can further define personalized client config here. 
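Around these handlers the control flow is ping-driven: each executor polls `CLIENT_PING`, and the aggregator replies with an event plus pickled meta/data, where `UPDATE_MODEL` now carries raw weights rather than a pickled model object (see the `CLIENT_PING` hunk below). A schematic, heavily condensed sketch of the executor-side dispatch under those assumptions; `executor` is an `Executor` and `response` a `job_api_pb2.ServerResponse`:

```python
import pickle

from fedscale.cloud import commons

# Schematic sketch of the executor side of the ping/dispatch protocol,
# condensed from Executor.event_monitor; not the literal wire code.
def dispatch(executor, response):
    event = response.event
    if event == commons.UPDATE_MODEL:
        # The aggregator now ships plain weights, not a pickled model.
        executor.UpdateModel(pickle.loads(response.data))
    elif event == commons.MODEL_TEST:
        executor.Test(pickle.loads(response.meta))
    elif event == commons.CLIENT_TRAIN:
        config = pickle.loads(response.meta)      # {'client_id', 'task_config'}
        config['model'] = pickle.loads(response.data)  # current round weights
        executor.Train(config)
    elif event == commons.SHUT_DOWN:
        executor.Stop()
```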
Args: client_id (int): Client id. Returns: dictionary: Shutdown config for new task. @@ -758,7 +689,7 @@ def add_event_handler(self, client_id, event, meta, data): self.sever_events_queue.append((client_id, event, meta, data)) def CLIENT_REGISTER(self, request, context): """FL Client register to the aggregator Args: request (RegisterRequest): Registeration request info from executor. @@ -813,11 +744,11 @@ def CLIENT_PING(self, request, context): current_event = commons.DUMMY_EVENT if self.experiment_mode != commons.SIMULATION_MODE: self.individual_client_events[executor_id].append( - commons.CLIENT_TRAIN) + commons.CLIENT_TRAIN) elif current_event == commons.MODEL_TEST: response_msg = self.get_test_config(client_id) elif current_event == commons.UPDATE_MODEL: - response_data = self.get_global_model() + response_data = self.model_wrapper.get_weights() elif current_event == commons.SHUT_DOWN: response_msg = self.get_shutdown_config(executor_id) @@ -825,7 +756,7 @@ response_msg), self.serialize_response(response_data) # NOTE: in simulation mode, response data is pickle for faster (de)serialization response = job_api_pb2.ServerResponse(event=current_event, - meta=response_msg, data=response_data) + meta=response_msg, data=response_data) if current_event != commons.DUMMY_EVENT: logging.info(f"Issue EVENT ({current_event}) to EXECUTOR ({executor_id})") @@ -852,7 +783,7 @@ def CLIENT_EXECUTE_COMPLETION(self, request, context): if execution_status is False: logging.error(f"Executor {executor_id} fails to run client {client_id}, due to {execution_msg}") - # TODO: whether we should schedule tasks when client_ping or client_complete + # TODO: whether we should schedule tasks when client_ping or client_complete if self.resource_manager.has_next_task(executor_id): # NOTE: we do not pop the train immediately in simulation mode, # since the executor may run multiple clients diff --git a/fedscale/cloud/aggregation/optimizers.py b/fedscale/cloud/aggregation/optimizers.py index 796c7a16..47580e2c 100644 --- a/fedscale/cloud/aggregation/optimizers.py +++ b/fedscale/cloud/aggregation/optimizers.py @@ -1,4 +1,4 @@ -class ServerOptimizer(object): +class TorchServerOptimizer(object): """This is a abstract server optimizer class Args: diff --git a/fedscale/cloud/channels/job_api_pb2_grpc.py b/fedscale/cloud/channels/job_api_pb2_grpc.py index b1a45fad..9f56cd0b 100644 --- a/fedscale/cloud/channels/job_api_pb2_grpc.py +++ b/fedscale/cloud/channels/job_api_pb2_grpc.py @@ -1,5 +1,5 @@ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 
"""Client and server classes corresponding to protobuf-defined services.""" import grpc import fedscale.cloud.channels.job_api_pb2 as job__api__pb2 diff --git a/fedscale/cloud/commons.py b/fedscale/cloud/commons.py index 89ec778e..3bb159ee 100644 --- a/fedscale/cloud/commons.py +++ b/fedscale/cloud/commons.py @@ -1,9 +1,8 @@ - # Define Basic Experiment Setup +from enum import Enum + SIMULATION_MODE = 'simulation' DEPLOYMENT_MODE = 'deployment' -TENSORFLOW = 'tensorflow' -PYTORCH = 'pytorch' # Define Basic FL Events UPDATE_MODEL = 'update_model' @@ -17,3 +16,7 @@ # PLACEHOLD DUMMY_RESPONSE = 'N' + + +TENSORFLOW = 'tensorflow' +PYTORCH = 'pytorch' diff --git a/fedscale/cloud/config_parser.py b/fedscale/cloud/config_parser.py index e0188e84..d5cf1476 100644 --- a/fedscale/cloud/config_parser.py +++ b/fedscale/cloud/config_parser.py @@ -9,7 +9,7 @@ # The basic configuration of the cluster parser.add_argument('--ps_ip', type=str, default='127.0.0.1') -parser.add_argument('--ps_port', type=str, default='29501') +parser.add_argument('--ps_port', type=str, default='29500') parser.add_argument('--this_rank', type=int, default=1) parser.add_argument('--connection_timeout', type=int, default=60) parser.add_argument('--experiment_mode', type=str, @@ -31,7 +31,8 @@ # The configuration of model and dataset parser.add_argument('--model_zoo', type=str, default='torchcv', - help="model zoo to load the models from", choices=["torchcv", "fedscale-zoo"]) + help="model zoo to load the models from", choices=["torchcv", "fedscale-torch-zoo", + "fedscale-tensorflow-zoo"]) parser.add_argument('--data_dir', type=str, default='~/cifar10/') parser.add_argument('--device_conf_file', type=str, default='/tmp/client.cfg') parser.add_argument('--model', type=str, default='shufflenet_v2_x2_0') @@ -50,7 +51,7 @@ parser.add_argument('--blacklist_max_len', type=float, default=0.3) parser.add_argument('--embedding_file', type=str, default='glove.840B.300d.txt') -parser.add_argument('--input_shape', type=tuple, default=(1, 3, 28, 28)) +parser.add_argument('--input_shape', type=int, nargs='+', default=[1, 3, 28, 28]) # The configuration of different hyper-parameters for training diff --git a/fedscale/cloud/execution/client_base.py b/fedscale/cloud/execution/client_base.py new file mode 100644 index 00000000..5163a7cc --- /dev/null +++ b/fedscale/cloud/execution/client_base.py @@ -0,0 +1,40 @@ +import abc + +from fedscale.cloud.internal.model_adapter_base import ModelAdapterBase + + +class ClientBase(abc.ABC): + """ + Represents a framework-agnostic FL client that can perform training and evaluation. + """ + + @abc.abstractmethod + def train(self, client_data, model, conf): + """ + Perform a training task. + :param client_data: client training dataset + :param model: the framework-specific model + :param conf: job config + :return: training results + """ + pass + + @abc.abstractmethod + def test(self, client_data, model, conf): + """ + Perform a testing task. + :param client_data: client evaluation dataset + :param model: the framework-specific model + :param conf: job config + :return: testing results + """ + pass + + @abc.abstractmethod + def get_model_adapter(self, model) -> ModelAdapterBase: + """ + Return framework-specific model adapter. 
+ :param model: the model + :return: a model adapter containing the model + """ + pass diff --git a/fedscale/cloud/execution/data_processor.py b/fedscale/cloud/execution/data_processor.py index 45532930..a280cd12 100644 --- a/fedscale/cloud/execution/data_processor.py +++ b/fedscale/cloud/execution/data_processor.py @@ -1,5 +1,4 @@ -import os - +import torch from torch.nn.utils.rnn import pad_sequence from fedscale.cloud.fllibs import * @@ -14,9 +13,6 @@ def collate(examples): def voice_collate_fn(batch): def func(p): return p[0].size(1) - - start_time = time.time() - batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True) longest_sample = max(batch, key=func)[0] freq_size = longest_sample.size(0) @@ -36,7 +32,4 @@ def func(p): target_sizes[x] = len(target) targets.extend(target) targets = torch.IntTensor(targets) - - end_time = time.time() - return (inputs, targets, input_percentages, target_sizes), None diff --git a/fedscale/cloud/execution/executor.py b/fedscale/cloud/execution/executor.py index bbdedaed..f63dd397 100755 --- a/fedscale/cloud/execution/executor.py +++ b/fedscale/cloud/execution/executor.py @@ -2,19 +2,22 @@ import collections import gc import pickle +import random +import time from argparse import Namespace +import numpy as np import torch import fedscale.cloud.channels.job_api_pb2 as job_api_pb2 -import fedscale.cloud.logger.execution as logger -import fedscale.cloud.config_parser as parser -from fedscale.cloud import commons +import fedscale.cloud.logger.executor_logging as logger from fedscale.cloud.channels.channel_context import ClientConnections -from fedscale.cloud.execution.client import Client +from fedscale.cloud.execution.tensorflow_client import TensorflowClient +from fedscale.cloud.execution.torch_client import TorchClient from fedscale.cloud.execution.data_processor import collate, voice_collate_fn -from fedscale.cloud.execution.rlclient import RLClient +from fedscale.cloud.execution.rl_client import RLClient from fedscale.cloud.fllibs import * +from fedscale.dataloaders.divide_data import DataPartitioner, select_dataset class Executor(object): @@ -24,13 +27,14 @@ class Executor(object): args (dictionary): Variable arguments for fedscale runtime config. defaults to the setup in arg_parser.py """ + def __init__(self, args): # initiate the executor log path, and executor ips logger.initiate_client_setting() + self.model_adapter = self.get_client_trainer(args).get_model_adapter(init_model()) + self.args = args - self.device = args.cuda_device if args.use_cuda else torch.device( - 'cpu') self.num_executors = args.num_executors # ======== env information ======== self.this_rank = args.this_rank @@ -38,8 +42,6 @@ def __init__(self, args): # ======== model and data ======== self.training_sets = self.test_dataset = None - self.temp_model_path = os.path.join( - logger.logDir, 'model_'+str(args.this_rank)+'.pth.tar') # ======== channels ======== self.aggregator_communicator = ClientConnections( @@ -47,7 +49,6 @@ def __init__(self, args): # ======== runtime information ======== self.collate_fn = None - self.task = args.task self.round = 0 self.start_run_time = time.time() self.received_stop_request = False @@ -75,10 +76,10 @@ def setup_seed(self, seed=1): """ torch.manual_seed(seed) + torch.backends.cudnn.deterministic = True torch.cuda.manual_seed_all(seed) np.random.seed(seed) random.seed(seed) - torch.backends.cudnn.deterministic = True def init_control_communication(self): """Create communication channel between coordinator and executor. 
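`ClientBase` pins down a framework-agnostic contract, and `get_model_adapter` pairs each client with an adapter whose `get_weights`/`set_weights`/`get_model` calls the Executor and Aggregator rely on. A toy illustration of that contract follows; the real `TorchModelAdapter` also wraps a server optimizer, and the list-of-numpy-arrays convention is inferred from how the aggregator averages weights:

```python
import numpy as np
import torch

# Toy illustration of the adapter contract used by ClientBase/Executor:
# get_weights()/set_weights() move framework weights in and out as plain
# numpy arrays, while get_model() exposes the native framework object.
class ToyTorchAdapter:
    def __init__(self, model: torch.nn.Module):
        self.model = model

    def get_weights(self):
        return [p.cpu().numpy() for p in self.model.state_dict().values()]

    def set_weights(self, weights):
        state = {k: torch.from_numpy(np.asarray(w))
                 for k, w in zip(self.model.state_dict().keys(), weights)}
        self.model.load_state_dict(state)

    def get_model(self):
        return self.model


adapter = ToyTorchAdapter(torch.nn.Linear(4, 2))
round_trip = adapter.get_weights()
adapter.set_weights(round_trip)  # lossless round trip
```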
@@ -91,18 +92,6 @@ def init_data_communication(self): """ pass - def init_model(self): - """Get the model architecture used in training - - Returns: - PyTorch or TensorFlow module: Based on the executor's machine learning framework, initialize and return the model for training - - """ - assert self.args.engine == commons.PYTORCH, "Please override this function to define non-PyTorch models" - model = init_model() - model = model.to(device=self.device) - return model - def init_data(self): """Return the training and testing dataset @@ -111,9 +100,13 @@ def init_data(self): """ train_dataset, test_dataset = init_dataset() - if self.task == "rl": + if self.args.task == "rl": return train_dataset, test_dataset - # load data partitioner (entire_train_data) + if self.args.task == 'nlp': + self.collate_fn = collate + elif self.args.task == 'voice': + self.collate_fn = voice_collate_fn + # load data partitioner (entire_train_data) logging.info("Data partitioner starts ...") training_sets = DataPartitioner( @@ -127,11 +120,6 @@ def init_data(self): logging.info("Data partitioner completes ...") - if self.task == 'nlp': - self.collate_fn = collate - elif self.task == 'voice': - self.collate_fn = voice_collate_fn - return training_sets, testing_sets def run(self): @@ -144,10 +132,10 @@ def run(self): def dispatch_worker_events(self, request): """Add new events to worker queues - + Args: request (string): Add grpc request from server (e.g. MODEL_TEST, MODEL_TRAIN) to event_queue. - + """ self.event_queue.append(request) @@ -159,7 +147,7 @@ def deserialize_response(self, responses): Returns: ServerResponse defined at job_api.proto: The deserialized response object from server. - + """ return pickle.loads(responses) @@ -167,22 +155,23 @@ def serialize_response(self, responses): """Serialize the response to send to server upon assigned job completion Args: responses (string, bool, or bytes): Client responses after job completion. Returns: bytes stream: The serialized response object to server. - + """ return pickle.dumps(responses) - def UpdateModel(self, config): + def UpdateModel(self, model_weights): """Receive the broadcasted global model for current round Args: - config (PyTorch or TensorFlow model): The broadcasted global model config + model_weights (list): The broadcasted global model weights """ - self.update_model_handler(model=config) + self.round += 1 + self.model_adapter.set_weights(model_weights) def Train(self, config): """Load train config and data to start training on that client Args: config (dictionary): The client training config. - Returns: + Returns: tuple (int, dictionary): The client id and train result """ client_id, train_config = config['client_id'], config['task_config'] - model = None - if 'model' in config and config['model'] is not None: - model = config['model'] - + if 'model' not in config or not config['model']: + raise ValueError("The 'model' object must be a non-null value in the training config.") client_conf = self.override_conf(train_config) train_res = self.training_handler( - clientId=client_id, conf=client_conf, model=model) + client_id=client_id, conf=client_conf, model=config['model']) # Report execution completion meta information response = self.aggregator_communicator.stub.CLIENT_EXECUTE_COMPLETION( @@ -218,12 +205,12 @@ def Test(self, config): """Model Testing. 
By default, we test the accuracy on all data of clients in the test group Args: config (dictionary): The client testing config. - """ - test_res = self.testing_handler(args=self.args, config=config) + test_res = self.testing_handler() test_res = {'executorId': self.this_rank, 'results': test_res} # Report execution completion information @@ -251,30 +238,6 @@ def report_executor_info_handler(self): """ return self.training_sets.getSize() - def update_model_handler(self, model): - """Update the model copy on this executor - - Args: - config (PyTorch or TensorFlow model): The broadcasted global model - - """ - self.round += 1 - - # Dump latest model to disk - with open(self.temp_model_path, 'wb') as model_out: - pickle.dump(model, model_out) - - def load_global_model(self): - """ Load last global model - - Returns: - PyTorch or TensorFlow model: The lastest global model - - """ - with open(self.temp_model_path, 'rb') as model_in: - model = pickle.load(model_in) - return model - def override_conf(self, config): """ Override the variable arguments for different client @@ -293,53 +256,48 @@ return Namespace(**default_conf) def get_client_trainer(self, conf): - """A abstract base class for client with training handler, developer can redefine to this function to customize the client training: - - Args: - config (dictionary): The client runtime config. - - Returns: - Client: A abstract base client class with runtime config conf. - """ - return Client(conf) """ + Returns a framework-specific client that handles training and evaluation. + :param conf: job config + :return: framework-specific client instance """ + if conf.engine == commons.TENSORFLOW: + return TensorflowClient(conf) + elif conf.engine == commons.PYTORCH: + if conf.task == 'rl': + return RLClient(conf) + else: + return TorchClient(conf) + raise ValueError("Currently, FedScale supports tensorflow and pytorch.") - def training_handler(self, clientId, conf, model=None): + def training_handler(self, client_id, conf, model): """Train model given client id - Args: - clientId (int): The client id. + client_id (int): The client id. conf (dictionary): The client runtime config. Returns: dictionary: The train result - """ - # load last global model - client_model = self.load_global_model() if model is None else model - conf.clientId, conf.device = clientId, self.device + """ + self.model_adapter.set_weights(model) + conf.client_id = client_id conf.tokenizer = tokenizer - if self.args.task == "rl": - client_data = self.training_sets - client = RLClient(conf) - train_res = client.train( - client_data=client_data, model=client_model, conf=conf) - else: - client_data = select_dataset(clientId, self.training_sets, - batch_size=conf.batch_size, args=self.args, - collate_fn=self.collate_fn - ) - - client = self.get_client_trainer(conf) - train_res = client.train( - client_data=client_data, model=client_model, conf=conf) + client_data = self.training_sets if self.args.task == "rl" else \ select_dataset(client_id, self.training_sets, batch_size=conf.batch_size, args=self.args, collate_fn=self.collate_fn ) + client = self.get_client_trainer(self.args) + train_res = client.train( client_data=client_data, model=self.model_adapter.get_model(), conf=conf) return train_res - def testing_handler(self, args, config=None): + def testing_handler(self): """Test model - Args: args (dictionary): Variable arguments for fedscale runtime config. 
defaults to the setup in arg_parser.py config (dictionary): Variable arguments from coordinator. @@ -347,37 +305,21 @@ def testing_handler(self, args, config=None): dictionary: The test result """ - evalStart = time.time() - device = self.device - model = self.load_global_model() - if self.task == 'rl': - client = RLClient(args) - test_res = client.test(args, self.this_rank, model, device=device) - _, _, _, testResults = test_res - else: - data_loader = select_dataset(self.this_rank, self.testing_sets, - batch_size=args.test_bsz, args=args, - isTest=True, collate_fn=self.collate_fn - ) - - if self.task == 'voice': - criterion = CTCLoss(reduction='mean').to(device=device) - else: - criterion = torch.nn.CrossEntropyLoss().to(device=device) - - if self.args.engine == commons.PYTORCH: - test_res = test_model(self.this_rank, model, data_loader, - device=device, criterion=criterion, tokenizer=tokenizer) - else: - raise Exception(f"Need customized implementation for model testing in {self.args.engine} engine") - - test_loss, acc, acc_5, testResults = test_res - logging.info("After aggregation round {}, CumulTime {}, eval_time {}, test_loss {}, test_accuracy {:.2f}%, test_5_accuracy {:.2f}% \n" - .format(self.round, round(time.time() - self.start_run_time, 4), round(time.time() - evalStart, 4), test_loss, acc*100., acc_5*100.)) + test_config = self.override_conf({ + 'rank': self.this_rank, + 'memory_capacity': self.args.memory_capacity, + 'tokenizer': tokenizer + }) + client = self.get_client_trainer(test_config) + data_loader = select_dataset(self.this_rank, self.testing_sets, + batch_size=self.args.test_bsz, args=self.args, + isTest=True, collate_fn=self.collate_fn) + + test_results = client.test(data_loader, self.model_adapter.get_model(), test_config) gc.collect() - return testResults + return test_results def client_register(self): """Register the executor information to the aggregator @@ -414,7 +356,7 @@ def event_monitor(self): logging.info("Start monitoring events ...") self.client_register() - while self.received_stop_request == False: + while not self.received_stop_request: if len(self.event_queue) > 0: request = self.event_queue.popleft() current_event = request.event @@ -438,8 +380,8 @@ def event_monitor(self): self.Test(self.deserialize_response(request.meta)) elif current_event == commons.UPDATE_MODEL: - broadcast_config = self.deserialize_response(request.data) - self.UpdateModel(broadcast_config) + model_weights = self.deserialize_response(request.data) + self.UpdateModel(model_weights) elif current_event == commons.SHUT_DOWN: self.Stop() diff --git a/fedscale/cloud/execution/rlclient.py b/fedscale/cloud/execution/rl_client.py similarity index 85% rename from fedscale/cloud/execution/rlclient.py rename to fedscale/cloud/execution/rl_client.py index cfc9755a..16e43867 100644 --- a/fedscale/cloud/execution/rlclient.py +++ b/fedscale/cloud/execution/rl_client.py @@ -1,7 +1,7 @@ import logging import math -from fedscale.cloud.execution.client import Client +from fedscale.cloud.execution.torch_client import TorchClient from fedscale.cloud.execution.optimizers import ClientOptimizer import fedscale.cloud.config_parser as parser @@ -9,7 +9,7 @@ from fedscale.dataloaders.dqn import * -class RLClient(Client): +class RLClient(TorchClient): """Basic client component in Federated Learning""" def __init__(self, conf): @@ -19,9 +19,9 @@ def __init__(self, conf): def train(self, client_data, model, conf): - clientId = conf.clientId - logging.info(f"Start to train (CLIENT: {clientId}) ...") - 
device = conf.device + client_id = conf.client_id + logging.info(f"Start to train (CLIENT: {client_id}) ...") + device = self.device model = model.to(device=device) # self.dqn.eval_net = self.dqn.eval_net.to(device=device) # self.dqn.target_net = self.dqn.target_net.to(device=device) @@ -78,23 +78,23 @@ def train(self, client_data, model, conf): model.load_state_dict(self.dqn.target_net.state_dict()) model_param = [param.data.cpu().numpy() for param in model.parameters()] - results = {'clientId': clientId, 'moving_loss': epoch_train_loss, + results = {'client_id': client_id, 'moving_loss': epoch_train_loss, 'trained_size': completed_steps*conf.batch_size, 'success': completed_steps > 0} results['utility'] = math.sqrt( epoch_train_loss)*float(trained_unique_samples) if error_type is None: - logging.info(f"Training of (CLIENT: {clientId}) completes, {results}") + logging.info(f"Training of (CLIENT: {client_id}) completes, {results}") else: - logging.info(f"Training of (CLIENT: {clientId}) failed as {error_type}") + logging.info(f"Training of (CLIENT: {client_id}) failed as {error_type}") results['update_weight'] = model_param results['wall_duration'] = 0 return results - def test(self, args, rank, model, device): - model = model.to(device=device) + def test(self, client_data, model, conf): + model = model.to(device=self.device) self.dqn.target_net.load_state_dict(model.state_dict()) self.dqn.set_eval_mode() env = gym.make('CartPole-v0').unwrapped @@ -112,11 +112,11 @@ def test(self, client_data, model, conf): self.dqn.store_transition(s, a, new_r, s_) reward_sum += new_r s = s_ - if self.dqn.memory_counter > args.memory_capacity: + if self.dqn.memory_counter > conf.memory_capacity: test_loss += self.dqn.learn() if done: break logging.info('Rank {}: Test set: Average loss: {}, Reward: {}' - .format(rank, test_loss, reward_sum)) + .format(conf.rank, test_loss, reward_sum)) return 0, 0, 0, {'top_1': reward_sum, 'top_5': reward_sum, 'test_loss': test_loss, 'test_len': 1} diff --git a/fedscale/cloud/execution/tensorflow_client.py b/fedscale/cloud/execution/tensorflow_client.py new file mode 100644 index 00000000..2b9dd924 --- /dev/null +++ b/fedscale/cloud/execution/tensorflow_client.py @@ -0,0 +1,95 @@ +import logging +import tensorflow as tf +from overrides import overrides +from fedscale.cloud.execution.client_base import ClientBase +import numpy as np + +from fedscale.cloud.internal.tensorflow_model_adapter import TensorflowModelAdapter + + +class TensorflowClient(ClientBase): +    """Implements a TensorFlow-based client for training and evaluation.""" + +    def __init__(self, args): +        """ +        Initializes a tf client. +        :param args: Job args +        """ +        self.args = args + +    def _convert_np_to_tf_dataset(self, dataset): +        """ +        Converts the iterable numpy dataset to a tensorflow Dataset. 
+ :param dataset: numpy dataset + :return: tf.data.Dataset + """ + def gen(): + while True: + for x, y in dataset: + # Convert torch tensor to tf tensor + nx, ny = tf.convert_to_tensor(x.swapaxes(1, 3).numpy()), \ + tf.one_hot(tf.convert_to_tensor(y.numpy()), self.args.num_classes) + yield nx, ny + + # Sample a batch to get tensor properties + temp_x, temp_y = next(gen()) + x_shape, y_shape = temp_x.shape.as_list(), temp_y.shape.as_list() + x_shape[0], y_shape[0] = None, None + + return tf.data.Dataset.from_generator( + gen, + output_shapes=(tf.TensorShape(x_shape), tf.TensorShape(y_shape)), + output_types=(temp_x.dtype, temp_y.dtype), + ) + + @overrides + def train(self, client_data, model, conf): + """ + Perform a training task. + :param client_data: client training dataset + :param model: the framework-specific model + :param conf: job config + :return: training results + """ + client_id = conf.client_id + logging.info(f"Start to train (CLIENT: {client_id}) ...") + tf_dataset = self._convert_np_to_tf_dataset(client_data).take(conf.local_steps) + history = model.fit(tf_dataset, batch_size=conf.batch_size, verbose=1) + + # Report the training results + results = {'client_id': client_id, + 'moving_loss': sum(history.history['loss']) / (len(history.history['loss']) + 1e-4), + 'trained_size': history.history['row_count'], 'success': True, 'utility': 1} + + logging.info(f"Training of (CLIENT: {client_id}) completes, {results}") + + results['update_weight'] = [np.asarray(layer.get_weights()) for layer in model.layers if layer.trainable] + results['wall_duration'] = 0 + + return results + + @overrides + def test(self, client_data, model, conf): + """ + Perform a testing task. + :param client_data: client evaluation dataset + :param model: the framework-specific model + :param conf: job config + :return: testing results + """ + results = model.evaluate(self._convert_np_to_tf_dataset(client_data), batch_size=conf.batch_size, + return_dict=True) + for key, value in results.items(): + if key != 'row_count': + results[key] = results['row_count'] * value + results['test_len'] = results['row_count'] + return results + + @overrides + def get_model_adapter(self, model) -> TensorflowModelAdapter: + """ + Return framework-specific model adapter. + :param model: the model + :return: a model adapter containing the model + """ + return TensorflowModelAdapter(model) diff --git a/fedscale/cloud/execution/client.py b/fedscale/cloud/execution/torch_client.py similarity index 63% rename from fedscale/cloud/execution/client.py rename to fedscale/cloud/execution/torch_client.py index ea60be29..0dd83993 100644 --- a/fedscale/cloud/execution/client.py +++ b/fedscale/cloud/execution/torch_client.py @@ -1,22 +1,32 @@ import logging import math -import pickle +import time + import torch from torch.autograd import Variable +from overrides import overrides +from torch.nn import CTCLoss +from fedscale.cloud.execution.client_base import ClientBase from fedscale.cloud.execution.optimizers import ClientOptimizer +from fedscale.cloud.internal.torch_model_adapter import TorchModelAdapter from fedscale.dataloaders.nlp import mask_tokens +from fedscale.utils.model_test_module import test_pytorch_model -class Client(object): - """Basic client component in Federated Learning""" +class TorchClient(ClientBase): + """Implements a PyTorch-based client for training and evaluation.""" - def __init__(self, conf): + def __init__(self, args): + """ + Initializes a torch client. 
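The TensorFlow client above bridges torch-style data loaders into `tf.data` through a generator, probing one batch for dtypes and shapes before building the dataset. A cut-down sketch of that bridge; the `(8, 32, 32, 3)` batch shape and 10 classes are placeholder assumptions:

```python
import numpy as np
import tensorflow as tf

# Cut-down sketch of the generator bridge in _convert_np_to_tf_dataset:
# wrap an iterable of (x, y) numpy batches as a tf.data.Dataset.
def to_tf_dataset(batches, num_classes=10):
    def gen():
        for x, y in batches:
            # one-hot the integer labels, as the TensorflowClient does
            yield x, np.eye(num_classes, dtype=np.float32)[y]

    x0, y0 = next(gen())  # probe one batch for dtypes and shapes
    return tf.data.Dataset.from_generator(
        gen,
        output_types=(x0.dtype, y0.dtype),
        output_shapes=(x0.shape, y0.shape),
    )


fake_loader = [(np.random.rand(8, 32, 32, 3).astype(np.float32),
                np.random.randint(0, 10, size=8)) for _ in range(3)]
dataset = to_tf_dataset(fake_loader)
for bx, by in dataset.take(2):
    print(bx.shape, by.shape)  # (8, 32, 32, 3) (8, 10)
```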
+ :param args: Job args + """ + self.args = args self.optimizer = ClientOptimizer() - self.init_task(conf) - - def init_task(self, conf): - if conf.task == "detection": + self.device = args.cuda_device if args.use_cuda else torch.device( + 'cpu') + if args.task == "detection": self.im_data = Variable(torch.FloatTensor(1).cuda()) self.im_info = Variable(torch.FloatTensor(1).cuda()) self.num_boxes = Variable(torch.LongTensor(1).cuda()) @@ -24,15 +34,22 @@ def init_task(self, conf): self.epoch_train_loss = 1e-4 self.completed_steps = 0 - self.loss_squre = 0 + self.loss_squared = 0 + @overrides def train(self, client_data, model, conf): - - clientId = conf.clientId - logging.info(f"Start to train (CLIENT: {clientId}) ...") - tokenizer, device = conf.tokenizer, conf.device - - model = model.to(device=device) + """ + Perform a training task. + :param client_data: client training dataset + :param model: the framework-specific model + :param conf: job config + :return: training results + """ + client_id = conf.client_id + logging.info(f"Start to train (CLIENT: {client_id}) ...") + tokenizer = conf.tokenizer + + model = model.to(device=self.device) model.train() trained_unique_samples = min( @@ -47,7 +64,7 @@ def train(self, client_data, model, conf): criterion = self.get_criterion(conf) error_type = None - # NOTE: If one may hope to run fixed number of epochs, instead of iterations, + # NOTE: If one may hope to run fixed number of epochs, instead of iterations, # use `while self.completed_steps < conf.local_steps * len(client_data)` instead while self.completed_steps < conf.local_steps: try: @@ -59,17 +76,17 @@ def train(self, client_data, model, conf): state_dicts = model.state_dict() model_param = {p: state_dicts[p].data.cpu().numpy() for p in state_dicts} - results = {'clientId': clientId, 'moving_loss': self.epoch_train_loss, - 'trained_size': self.completed_steps*conf.batch_size, + results = {'client_id': client_id, 'moving_loss': self.epoch_train_loss, + 'trained_size': self.completed_steps * conf.batch_size, 'success': self.completed_steps == conf.local_steps} if error_type is None: - logging.info(f"Training of (CLIENT: {clientId}) completes, {results}") + logging.info(f"Training of (CLIENT: {client_id}) completes, {results}") else: - logging.info(f"Training of (CLIENT: {clientId}) failed as {error_type}") + logging.info(f"Training of (CLIENT: {client_id}) failed as {error_type}") results['utility'] = math.sqrt( - self.loss_squre)*float(trained_unique_samples) + self.loss_squared) * float(trained_unique_samples) results['update_weight'] = model_param results['wall_duration'] = 0 @@ -83,10 +100,10 @@ def get_optimizer(self, model, conf): for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if 'bias' in key: - params += [{'params': [value], 'lr':lr*(cfg.TRAIN.DOUBLE_BIAS + 1), + params += [{'params': [value], 'lr': lr * (cfg.TRAIN.DOUBLE_BIAS + 1), 'weight_decay': cfg.TRAIN.BIAS_DECAY and cfg.TRAIN.WEIGHT_DECAY or 0}] else: - params += [{'params': [value], 'lr':lr, + params += [{'params': [value], 'lr': lr, 'weight_decay': cfg.TRAIN.WEIGHT_DECAY}] optimizer = torch.optim.SGD(params, momentum=cfg.TRAIN.MOMENTUM) @@ -116,10 +133,10 @@ def get_criterion(self, conf): criterion = None if conf.task == 'voice': from torch_baidu_ctc import CTCLoss - criterion = CTCLoss(reduction='none').to(device=conf.device) + criterion = CTCLoss(reduction='none').to(device=self.device) else: criterion = torch.nn.CrossEntropyLoss( - reduction='none').to(device=conf.device) + 
reduction='none').to(device=self.device) return criterion def train_step(self, client_data, conf, model, optimizer, criterion): @@ -128,10 +145,10 @@ def train_step(self, client_data, conf, model, optimizer, criterion): if conf.task == 'nlp': (data, _) = data_pair data, target = mask_tokens( - data, tokenizer, conf, device=conf.device) + data, tokenizer, conf, device=self.device) elif conf.task == 'voice': (data, target, input_percentages, - target_sizes), _ = data_pair + target_sizes), _ = data_pair input_sizes = input_percentages.mul_( int(data.size(3))).int() elif conf.task == 'detection': @@ -147,16 +164,16 @@ def train_step(self, client_data, conf, model, optimizer, criterion): self.gt_boxes.resize_(data[2].size()).copy_(data[2]) self.num_boxes.resize_(data[3].size()).copy_(data[3]) elif conf.task == 'speech': - data = torch.unsqueeze(data, 1).to(device=conf.device) + data = torch.unsqueeze(data, 1).to(device=self.device) elif conf.task == 'text_clf' and conf.model == 'albert-base-v2': (data, masks) = data data, masks = Variable(data).to( - device=conf.device), Variable(masks).to(device=conf.device) + device=self.device), Variable(masks).to(device=self.device) else: - data = Variable(data).to(device=conf.device) + data = Variable(data).to(device=self.device) - target = Variable(target).to(device=conf.device) + target = Variable(target).to(device=self.device) if conf.task == 'nlp': outputs = model(data, labels=target) @@ -173,19 +190,19 @@ def train_step(self, client_data, conf, model, optimizer, criterion): output = outputs.logits elif conf.task == "detection": rois, cls_prob, bbox_pred, \ - rpn_loss_cls, rpn_loss_box, \ - RCNN_loss_cls, RCNN_loss_bbox, \ - rois_label = model( - self.im_data, self.im_info, self.gt_boxes, self.num_boxes) + rpn_loss_cls, rpn_loss_box, \ + RCNN_loss_cls, RCNN_loss_bbox, \ + rois_label = model( + self.im_data, self.im_info, self.gt_boxes, self.num_boxes) loss = rpn_loss_cls + rpn_loss_box \ - + RCNN_loss_cls + RCNN_loss_bbox + + RCNN_loss_cls + RCNN_loss_bbox loss_rpn_cls = rpn_loss_cls.item() loss_rpn_box = rpn_loss_box.item() loss_rcnn_cls = RCNN_loss_cls.item() loss_rcnn_box = RCNN_loss_bbox.item() - + else: output = model(data) loss = criterion(output, target) @@ -202,16 +219,16 @@ def train_step(self, client_data, conf, model, optimizer, criterion): loss_list = loss.tolist() loss = loss.mean() - temp_loss = sum(loss_list)/float(len(loss_list)) - self.loss_squre = sum([l**2 for l in loss_list] - )/float(len(loss_list)) + temp_loss = sum(loss_list) / float(len(loss_list)) + self.loss_squared = sum([l ** 2 for l in loss_list] + ) / float(len(loss_list)) # only measure the loss of the first epoch if self.completed_steps < len(client_data): if self.epoch_train_loss == 1e-4: self.epoch_train_loss = temp_loss else: self.epoch_train_loss = ( - 1. - conf.loss_decay) * self.epoch_train_loss + conf.loss_decay * temp_loss + 1. - conf.loss_decay) * self.epoch_train_loss + conf.loss_decay * temp_loss # ========= Define the backward loss ============== optimizer.zero_grad() @@ -227,6 +244,34 @@ def train_step(self, client_data, conf, model, optimizer, criterion): if self.completed_steps == conf.local_steps: break - - def test(self, conf): - pass + @overrides + def test(self, client_data, model, conf): + """ + Perform a testing task. 
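For reference, a framework-free sketch of the step-bounded local-training contract that `train`/`train_step` above follow: `local_steps` counts optimizer steps, not epochs, and the loader is re-entered if it runs dry before the budget is spent. The SGD settings, the 0.05 loss-decay constant, and the synthetic loader are illustrative, not FedScale defaults.

```python
import torch

def local_train(model, loader, local_steps, lr=0.05, device="cpu"):
    """Run a fixed number of optimization steps, re-entering the data
    loader if it is exhausted before the step budget is spent."""
    model.to(device).train()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    completed_steps, moving_loss = 0, 0.0
    while completed_steps < local_steps:
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            loss = criterion(model(data), target)
            loss.backward()
            optimizer.step()
            # exponential moving average of the loss, as in train_step above
            moving_loss = loss.item() if completed_steps == 0 \
                else 0.95 * moving_loss + 0.05 * loss.item()
            completed_steps += 1
            if completed_steps == local_steps:
                break
    return {"moving_loss": moving_loss, "trained_steps": completed_steps}

# Illustrative usage with a synthetic three-batch loader
loader = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(3)]
print(local_train(torch.nn.Linear(10, 2), loader, local_steps=5))
```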
+ :param client_data: client evaluation dataset + :param model: the framework-specific model + :param conf: job config + :return: testing results + """ + evalStart = time.time() + if self.args.task == 'voice': + criterion = CTCLoss(reduction='mean').to(device=self.device) + else: + criterion = torch.nn.CrossEntropyLoss().to(device=self.device) + test_loss, acc, acc_5, test_results = test_pytorch_model(conf.rank, model, client_data, + device=self.device, criterion=criterion, + tokenizer=conf.tokenizer) + logging.info( + "Test results: Eval_time {}, test_loss {}, test_accuracy {:.2f}%, " + "test_5_accuracy {:.2f}% \n" + .format(round(time.time() - evalStart, 4), test_loss, acc * 100., acc_5 * 100.)) + return test_results + + @overrides + def get_model_adapter(self, model) -> TorchModelAdapter: + """ + Return framework-specific model adapter. + :param model: the model + :return: a model adapter containing the model + """ + return TorchModelAdapter(model) diff --git a/fedscale/cloud/fllibs.py b/fedscale/cloud/fllibs.py index 88b6bacc..de6729c2 100644 --- a/fedscale/cloud/fllibs.py +++ b/fedscale/cloud/fllibs.py @@ -1,45 +1,22 @@ # Standard libs -import collections -import copy -import datetime -import gc import json import logging -import math import os -import pickle -import random -import re -import socket import sys -import threading -import time -from collections import OrderedDict - -import numpy -import numpy as np -# PyTorch libs -import torch -import torch.distributed as dist import torchvision.models as tormodels -from torch.autograd import Variable -from torch.multiprocessing import Process, Queue -from torch.utils.data import DataLoader from torchvision import datasets, transforms -from fedscale.cloud.aggregation.optimizers import ServerOptimizer -from fedscale.cloud.client_manager import ClientManager - # libs from fedscale import fedscale.cloud.config_parser as parser -from fedscale.dataloaders.divide_data import DataPartitioner, select_dataset +from fedscale.cloud import commons from fedscale.dataloaders.utils_data import get_data_transform -from fedscale.utils.model_test_module import test_model # FedScale model libs -from fedscale.utils.models.model_provider import get_cv_model +from fedscale.utils.models.torch_model_provider import get_cv_model +from fedscale.utils.models.tensorflow_model_provider import get_tensorflow_model tokenizer = None + def import_libs(): global tokenizer @@ -47,35 +24,34 @@ def import_libs(): global AdamW, AlbertTokenizer, AutoConfig, AutoModelWithLMHead, AutoTokenizer, MobileBertForPreTraining, load_and_cache_examples, mask_tokens from transformers import (AdamW, AlbertTokenizer, AutoConfig, - AutoModelWithLMHead, AutoTokenizer, - MobileBertForPreTraining) + AutoModelWithLMHead, AutoTokenizer, + MobileBertForPreTraining) from fedscale.dataloaders.nlp import load_and_cache_examples, mask_tokens tokenizer = AlbertTokenizer.from_pretrained( 'albert-base-v2', do_lower_case=True) elif parser.args.task == 'speech': - global numba, SPEECH, BackgroundNoiseDataset, AddBackgroundNoiseOnSTFT, DeleteSTFT,FixSTFTDimension, StretchAudioOnSTFT, TimeshiftAudioOnSTFT, ToMelSpectrogramFromSTFT, ToSTFT, ChangeAmplitude, ChangeSpeedAndPitchAudio, FixAudioLength, LoadAudio, ToMelSpectrogram, ToTensor + global numba, SPEECH, BackgroundNoiseDataset, AddBackgroundNoiseOnSTFT, DeleteSTFT, FixSTFTDimension, StretchAudioOnSTFT, TimeshiftAudioOnSTFT, ToMelSpectrogramFromSTFT, ToSTFT, ChangeAmplitude, ChangeSpeedAndPitchAudio, FixAudioLength, LoadAudio, ToMelSpectrogram, ToTensor 
import numba from fedscale.dataloaders.speech import SPEECH, BackgroundNoiseDataset from fedscale.dataloaders.transforms_stft import (AddBackgroundNoiseOnSTFT, - DeleteSTFT, - FixSTFTDimension, - StretchAudioOnSTFT, - TimeshiftAudioOnSTFT, - ToMelSpectrogramFromSTFT, - ToSTFT) + DeleteSTFT, + FixSTFTDimension, + StretchAudioOnSTFT, + TimeshiftAudioOnSTFT, + ToMelSpectrogramFromSTFT, + ToSTFT) from fedscale.dataloaders.transforms_wav import (ChangeAmplitude, - ChangeSpeedAndPitchAudio, - FixAudioLength, LoadAudio, - ToMelSpectrogram, - ToTensor) + ChangeSpeedAndPitchAudio, + FixAudioLength, LoadAudio, + ToMelSpectrogram, + ToTensor) elif parser.args.task == 'detection': global pickle, get_imdb, readClass, resnet, nms, bbox_transform_inv, clip_boxes, cfg, cfg_from_file, cfg_from_list, get_output_dir, adjust_learning_rate, clip_gradient, load_net, save_checkpoint, save_net, weights_normal_init, roibatchLoader, combined_roidb import pickle - from fedscale.dataloaders.rcnn.lib.datasets.factory import get_imdb from fedscale.dataloaders.rcnn.lib.datasets.pascal_voc import readClass from fedscale.dataloaders.rcnn.lib.model.faster_rcnn.resnet import resnet @@ -102,6 +78,7 @@ def import_libs(): from fedscale.dataloaders.dqn import RLData, Net, DQN + # shared functions of aggregator and clients # initiate for nlp @@ -109,7 +86,6 @@ def import_libs(): os.environ['MASTER_ADDR'] = parser.args.ps_ip os.environ['MASTER_PORT'] = parser.args.ps_port - outputClass = {'Mnist': 10, 'cifar10': 10, "imagenet": 1000, 'emnist': 47, 'amazon': 5, 'openImg': 596, 'google_speech': 35, 'femnist': 62, 'yelp': 5, 'inaturalist': 1010 } @@ -124,7 +100,7 @@ def init_model(): if parser.args.task == 'nlp': config = AutoConfig.from_pretrained( - os.path.join(parser.args.data_dir, parser.args.model+'-config.json')) + os.path.join(parser.args.data_dir, parser.args.model + '-config.json')) model = AutoModelWithLMHead.from_config(config) tokenizer = AlbertTokenizer.from_pretrained( parser.args.model, do_lower_case=True) @@ -223,11 +199,14 @@ def init_model(): elif parser.args.model == 'svm': from fedscale.utils.models.simple.models import LinearSVM model = LinearSVM(parser.args.input_dim, outputClass[parser.args.data_set]) + elif parser.args.model_zoo == "fedscale-tensorflow-zoo": + assert parser.args.engine == commons.TENSORFLOW + model = get_tensorflow_model(parser.args.model, parser.args) else: - if parser.args.model_zoo == "fedscale-zoo": + if parser.args.model_zoo == "fedscale-torch-zoo": if parser.args.task == "cv": model = get_cv_model(name=parser.args.model, - num_classes=outputClass[parser.args.data_set]) + num_classes=outputClass[parser.args.data_set]) else: raise NameError(f"Model zoo {parser.args.model_zoo} does not exist") elif parser.args.model_zoo == "torchcv": @@ -239,7 +218,6 @@ def init_model(): def init_dataset(): - import_libs() if parser.args.task == "detection": @@ -249,7 +227,8 @@ def init_dataset(): imdb, roidb, ratio_list, ratio_index = combined_roidb( imdb_name, ['DATA_DIR', parser.args.data_dir], sizes=parser.args.train_size_file) train_dataset = roibatchLoader( - roidb, ratio_list, ratio_index, parser.args.batch_size, imdb.num_classes, imdb._image_index_temp, training=True) + roidb, ratio_list, ratio_index, parser.args.batch_size, imdb.num_classes, imdb._image_index_temp, + training=True) imdb_, roidb_, ratio_list_, ratio_index_ = combined_roidb( imdbval_name, ['DATA_DIR', parser.args.data_dir], sizes=parser.args.test_size_file, training=False) imdb_.competition_mode(on=True) @@ -350,7 +329,8 @@ def 
init_dataset(): elif parser.args.data_set == 'google_speech': bkg = '_background_noise_' data_aug_transform = transforms.Compose( - [ChangeAmplitude(), ChangeSpeedAndPitchAudio(), FixAudioLength(), ToSTFT(), StretchAudioOnSTFT(), TimeshiftAudioOnSTFT(), FixSTFTDimension()]) + [ChangeAmplitude(), ChangeSpeedAndPitchAudio(), FixAudioLength(), ToSTFT(), StretchAudioOnSTFT(), + TimeshiftAudioOnSTFT(), FixSTFTDimension()]) bg_dataset = BackgroundNoiseDataset( os.path.join(parser.args.data_dir, bkg), data_aug_transform) add_bg_noise = AddBackgroundNoiseOnSTFT(bg_dataset) diff --git a/fedscale/cloud/internal/model_adapter_base.py b/fedscale/cloud/internal/model_adapter_base.py new file mode 100644 index 00000000..067ef91c --- /dev/null +++ b/fedscale/cloud/internal/model_adapter_base.py @@ -0,0 +1,32 @@ +import abc +from typing import Any +import numpy as np + + +class ModelAdapterBase(abc.ABC): + """ + Represents an adapter that operates on a framework-specific model. + """ + @abc.abstractmethod + def set_weights(self, weights: np.ndarray): + """ + Set the model's weights to the numpy weights array. + :param weights: numpy weights array + """ + pass + + @abc.abstractmethod + def get_weights(self) -> np.ndarray: + """ + Get the model's weights as a numpy weights array. Note that it doesn't contain layer names. Rather, index 0 + contains the model's first layer weights, and index N contains the N+1 layer's weights. + :return: A numpy array + """ + pass + + @abc.abstractmethod + def get_model(self) -> Any: + """ + Get the instantiated framework specific model including the architecture. + """ + pass diff --git a/fedscale/cloud/internal/tensorflow_model_adapter.py b/fedscale/cloud/internal/tensorflow_model_adapter.py new file mode 100644 index 00000000..5c5785a5 --- /dev/null +++ b/fedscale/cloud/internal/tensorflow_model_adapter.py @@ -0,0 +1,22 @@ +from typing import List + +import numpy as np +import tensorflow as tf + +from fedscale.cloud.internal.model_adapter_base import ModelAdapterBase + + +class TensorflowModelAdapter(ModelAdapterBase): + def __init__(self, model: tf.keras.Model): + self.model = model + + def set_weights(self, weights: List[np.ndarray]): + for i, layer in enumerate(self.model.layers): + if layer.trainable: + layer.set_weights(weights[i]) + + def get_weights(self) -> List[np.ndarray]: + return [np.asarray(layer.get_weights()) for layer in self.model.layers if layer.trainable] + + def get_model(self): + return self.model diff --git a/fedscale/cloud/internal/torch_model_adapter.py b/fedscale/cloud/internal/torch_model_adapter.py new file mode 100644 index 00000000..44e2a950 --- /dev/null +++ b/fedscale/cloud/internal/torch_model_adapter.py @@ -0,0 +1,49 @@ +from typing import List + +import numpy as np +import torch + +from fedscale.cloud.aggregation.optimizers import TorchServerOptimizer +from fedscale.cloud.internal.model_adapter_base import ModelAdapterBase + + +class TorchModelAdapter(ModelAdapterBase): + """ + Adapts functions to pytorch models. + """ + def __init__(self, model: torch.nn.Module, optimizer: TorchServerOptimizer = None): + """ + Initializes a TorchModelAdapter. + :param model: the PyTorch model to adapt + :param optimizer: the optimizer to apply weights, when specified. + """ + self.model = model + self.optimizer = optimizer + + def set_weights(self, weights: List[np.ndarray]): + """ + Set the model's weights to the numpy weights array. 
+ :param weights: numpy weights array + """ + current_grad_weights = [param.data.clone() for param in self.model.state_dict().values()] + new_state_dict = { + name: torch.from_numpy(np.asarray(weights[i], dtype=np.float32)) + for i, name in enumerate(self.model.state_dict().keys()) + } + self.model.load_state_dict(new_state_dict) + if self.optimizer: + self.optimizer.update_round_gradient(weights, current_grad_weights, self.model) + + def get_weights(self) -> List[np.ndarray]: + """ + Get the model's weights as a numpy weights array. Note that it doesn't contain layer names. Rather, index 0 + contains the model's first layer weights, and index N contains the N+1 layer's weights. + :return: A numpy array + """ + return [params.data.clone() for params in self.model.state_dict().values()] + + def get_model(self): + """ + Get the instantiated framework specific model including the architecture. + """ + return self.model diff --git a/fedscale/cloud/logger/aggragation.py b/fedscale/cloud/logger/aggragation.py deleted file mode 100644 index 978d98d2..00000000 --- a/fedscale/cloud/logger/aggragation.py +++ /dev/null @@ -1,66 +0,0 @@ -# package for aggregator -from fedscale.cloud.fllibs import * -import fedscale.cloud.config_parser as parser - -logDir = None - - -def init_logging(): - global logDir - - logDir = os.path.join(parser.args.log_path, "logs", parser.args.job_name, - parser.args.time_stamp, 'aggregator') - logFile = os.path.join(logDir, 'log') - if not os.path.isdir(logDir): - os.makedirs(logDir, exist_ok=True) - - logging.basicConfig( - format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s', - datefmt='(%m-%d) %H:%M:%S', - level=logging.INFO, - handlers=[ - logging.FileHandler(logFile, mode='a'), - logging.StreamHandler() - ]) - - - -def initiate_aggregator_setting(): - init_logging() - -def aggregate_test_result(test_result_accumulator, task, round_num, global_virtual_clock, testing_history): - - accumulator = test_result_accumulator[0] - for i in range(1, len(test_result_accumulator)): - if task == "detection": - for key in accumulator: - if key == "boxes": - for j in range(596): - accumulator[key][j] = accumulator[key][j] + \ - test_result_accumulator[i][key][j] - else: - accumulator[key] += test_result_accumulator[i][key] - else: - for key in accumulator: - accumulator[key] += test_result_accumulator[i][key] - if task == "detection": - testing_history['perf'][round_num] = {'round': round_num, 'clock': global_virtual_clock, - 'top_1': round(accumulator['top_1']*100.0/len(test_result_accumulator), 4), - 'top_5': round(accumulator['top_5']*100.0/len(test_result_accumulator), 4), - 'loss': accumulator['test_loss'], - 'test_len': accumulator['test_len'] - } - else: - testing_history['perf'][round_num] = {'round': round_num, 'clock': global_virtual_clock, - 'top_1': round(accumulator['top_1']/accumulator['test_len']*100.0, 4), - 'top_5': round(accumulator['top_5']/accumulator['test_len']*100.0, 4), - 'loss': accumulator['test_loss']/accumulator['test_len'], - 'test_len': accumulator['test_len'] - } - - logging.info("FL Testing in round: {}, virtual_clock: {}, top_1: {} %, top_5: {} %, test loss: {:.4f}, test len: {}" - .format(round_num, global_virtual_clock, testing_history['perf'][round_num]['top_1'], - testing_history['perf'][round_num]['top_5'], testing_history['perf'][round_num]['loss'], - testing_history['perf'][round_num]['test_len'])) - - diff --git a/fedscale/cloud/logger/aggregator_logging.py b/fedscale/cloud/logger/aggregator_logging.py new file mode 100644 
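The ordering contract in `set_weights` above (index i must hold the i-th `state_dict()` entry) is easy to violate. Below is a minimal self-contained round-trip check of that contract; the two-layer model and the zeroed weights are illustrative.

```python
import numpy as np
import torch

# Round trip through state_dict-ordered numpy weights, the contract
# TorchModelAdapter relies on: index i holds the i-th state_dict entry.
model = torch.nn.Sequential(torch.nn.Linear(4, 3), torch.nn.Linear(3, 2))

weights = [p.detach().numpy().copy() for p in model.state_dict().values()]
zeroed = [np.zeros_like(w) for w in weights]

model.load_state_dict({
    name: torch.from_numpy(np.asarray(z, dtype=np.float32))
    for name, z in zip(model.state_dict().keys(), zeroed)
})
assert all(float(p.abs().sum()) == 0.0 for p in model.parameters())
```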
index 00000000..b2ac3742 --- /dev/null +++ b/fedscale/cloud/logger/aggregator_logging.py @@ -0,0 +1,27 @@ +from fedscale.cloud.fllibs import * +import fedscale.cloud.config_parser as parser + +logDir = None + + +def init_logging(): + global logDir + + logDir = os.path.join(parser.args.log_path, "logs", parser.args.job_name, + parser.args.time_stamp, 'aggregator') + logFile = os.path.join(logDir, 'log') + if not os.path.isdir(logDir): + os.makedirs(logDir, exist_ok=True) + + logging.basicConfig( + format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s', + datefmt='(%m-%d) %H:%M:%S', + level=logging.INFO, + handlers=[ + logging.FileHandler(logFile, mode='a'), + logging.StreamHandler() + ]) + + +def initiate_aggregator_setting(): + init_logging() diff --git a/fedscale/cloud/logger/dummy_logger.py b/fedscale/cloud/logger/dummy_logger.py new file mode 100644 index 00000000..9728324e --- /dev/null +++ b/fedscale/cloud/logger/dummy_logger.py @@ -0,0 +1,6 @@ +class DummyLogger: + def add_scalar(self, name, value, step): + pass + + def add_histogram(self, name, dist, step): + pass diff --git a/fedscale/cloud/logger/execution.py b/fedscale/cloud/logger/executor_logging.py similarity index 88% rename from fedscale/cloud/logger/execution.py rename to fedscale/cloud/logger/executor_logging.py index b4d426a2..2500da59 100644 --- a/fedscale/cloud/logger/execution.py +++ b/fedscale/cloud/logger/executor_logging.py @@ -1,16 +1,14 @@ -# package for client -import os - from fedscale.cloud.fllibs import * import fedscale.cloud.config_parser as parser logDir = None + def init_logging(): global logDir logDir = os.path.join(parser.args.log_path, "logs", parser.args.job_name, - parser.args.time_stamp, 'executor') + parser.args.time_stamp, 'executor') logFile = os.path.join(logDir, 'log') if not os.path.isdir(logDir): os.makedirs(logDir, exist_ok=True) @@ -27,5 +25,3 @@ def init_logging(): def initiate_client_setting(): init_logging() - - diff --git a/fedscale/dataloaders/divide_data.py b/fedscale/dataloaders/divide_data.py index 43c90c0b..81d2139c 100755 --- a/fedscale/dataloaders/divide_data.py +++ b/fedscale/dataloaders/divide_data.py @@ -42,7 +42,6 @@ def __init__(self, data, args, numOfClass=0, seed=10, isTest=False): np.random.seed(seed) self.data_len = len(self.data) - self.task = args.task self.numOfLabels = numOfClass self.client_label_cnt = defaultdict(set) @@ -62,8 +61,8 @@ def trace_partition(self, data_map_file): """Read data mapping from data_map_file. 
Format: """ logging.info(f"Partitioning data by profile {data_map_file}...") - clientId_maps = {} - unique_clientIds = {} + client_id_maps = {} + unique_client_ids = {} # load meta data from the data_map_file with open(data_map_file) as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') @@ -77,19 +76,19 @@ def trace_partition(self, data_map_file): else: client_id = row[0] - if client_id not in unique_clientIds: - unique_clientIds[client_id] = len(unique_clientIds) + if client_id not in unique_client_ids: + unique_client_ids[client_id] = len(unique_client_ids) - clientId_maps[sample_id] = unique_clientIds[client_id] - self.client_label_cnt[unique_clientIds[client_id]].add( + client_id_maps[sample_id] = unique_client_ids[client_id] + self.client_label_cnt[unique_client_ids[client_id]].add( row[-1]) sample_id += 1 # Partition data given mapping - self.partitions = [[] for _ in range(len(unique_clientIds))] + self.partitions = [[] for _ in range(len(unique_client_ids))] for idx in range(sample_id): - self.partitions[clientId_maps[idx]].append(idx) + self.partitions[client_id_maps[idx]].append(idx) def partition_data_helper(self, num_clients, data_map_file=None): @@ -114,7 +113,7 @@ def uniform_partition(self, num_clients): indexes = indexes[part_len:] def use(self, partition, istest): - resultIndex = self.partitions[partition] + resultIndex = self.partitions[partition % len(self.partitions)] exeuteLength = len(resultIndex) if not istest else int( len(resultIndex) * self.args.test_ratio) diff --git a/fedscale/dataloaders/reddit.py b/fedscale/dataloaders/reddit.py index 86c0eddc..6691befc 100644 --- a/fedscale/dataloaders/reddit.py +++ b/fedscale/dataloaders/reddit.py @@ -155,7 +155,7 @@ def load_file(self, path, is_train): for client_data in client_data_list: client_list = client_data['users'] - for clientId, client in enumerate(client_list): + for client_id, client in enumerate(client_list): tokens_list = list(client_data['user_data'][client]['x']) for tokens in tokens_list: @@ -165,7 +165,7 @@ def load_file(self, path, is_train): if not tokens_list: continue - mapping_dict[count] = clientId + mapping_dict[count] = client_id text.append(tokens_list) count += 1 @@ -176,13 +176,13 @@ def load_file(self, path, is_train): #print("====In loading data, remains {} clients, may take {} sec".format(num_of_remains, (time.time() - start_time)/clientCount * num_of_remains)) # logging.info("====In loading data, remains {} clients".format(num_of_remains) - if clientId % 5000 == 0: + if client_id % 5000 == 0: # dump the cache with open(cache_path, 'wb') as fout: pickle.dump(text, fout) pickle.dump(mapping_dict, fout) - print("====Dump for {} clients".format(clientId)) + print("====Dump for {} clients".format(client_id)) # dump the cache with open(cache_path, 'wb') as fout: diff --git a/fedscale/dataloaders/stackoverflow.py b/fedscale/dataloaders/stackoverflow.py index 7252a833..7e5d2d75 100755 --- a/fedscale/dataloaders/stackoverflow.py +++ b/fedscale/dataloaders/stackoverflow.py @@ -199,7 +199,7 @@ def load_file(self, path, is_train): client_list = list(train_file['examples']) start_time = time.time() - for clientId, client in enumerate(client_list): + for client_id, client in enumerate(client_list): tags_list = list(train_file['examples'][client]['tags']) tokens_list = list(train_file['examples'][client]['tokens']) title_list = list(train_file['examples'][client]['title']) @@ -215,7 +215,7 @@ def load_file(self, path, is_train): if not tokens_list: continue - mapping_dict[count] = clientId + 
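For reference, a condensed sketch of the trace-driven partitioning above, assuming an illustrative `client_id,sample_path,label` header; the modulo lookup at the end mirrors the wraparound just added to `use()`.

```python
import csv
import io
from collections import defaultdict

def trace_partition(csv_text):
    """Group sample indices by client id, as trace_partition above does
    with a client_data_mapping CSV; the header used here is illustrative."""
    unique_ids, partitions = {}, []
    label_cnt = defaultdict(set)
    reader = csv.reader(io.StringIO(csv_text))
    next(reader)  # skip the header row
    for sample_id, row in enumerate(reader):
        cid = row[0]
        if cid not in unique_ids:
            unique_ids[cid] = len(unique_ids)
            partitions.append([])
        partitions[unique_ids[cid]].append(sample_id)
        label_cnt[unique_ids[cid]].add(row[-1])
    return partitions

parts = trace_partition("client_id,sample_path,label\n"
                        "a,x0.png,0\na,x1.png,1\nb,x2.png,0\n")
print(parts)                   # [[0, 1], [2]]
print(parts[5 % len(parts)])   # wraparound lookup, as added to use()
```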
mapping_dict[count] = client_id text.append(tokens_list) target_tags.append(tags_list) @@ -223,18 +223,18 @@ def load_file(self, path, is_train): clientCount += 1 - num_of_remains = len(client_list) - clientId + num_of_remains = len(client_list) - client_id #print("====In loading data, remains {} clients, may take {} sec".format(num_of_remains, (time.time() - start_time)/clientCount * num_of_remains)) # logging.info("====In loading data, remains {} clients".format(num_of_remains) - if clientId % 5000 == 0: + if client_id % 5000 == 0: # dump the cache with open(cache_path, 'wb') as fout: pickle.dump(text, fout) pickle.dump(target_tags, fout) pickle.dump(mapping_dict, fout) - #print("====Dump for {} clients".format(clientId)) + #print("====Dump for {} clients".format(client_id)) # dump the cache with open(cache_path, 'wb') as fout: diff --git a/fedscale/dataloaders/utils_data.py b/fedscale/dataloaders/utils_data.py index 7f7db93f..395e8cf4 100755 --- a/fedscale/dataloaders/utils_data.py +++ b/fedscale/dataloaders/utils_data.py @@ -7,7 +7,7 @@ def get_data_transform(data: str): if data == 'mnist': train_transform = transforms.Compose([ # transforms.Grayscale(num_output_channels=1), - transforms.Resize((28, 28)), + transforms.Resize((32, 32)), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) @@ -15,7 +15,7 @@ def get_data_transform(data: str): test_transform = transforms.Compose([ # transforms.Grayscale(num_output_channels=1), - transforms.Resize((28, 28)), + transforms.Resize((32, 32)), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) diff --git a/fedscale/edge/mnn/app/src/main/java/com/fedscale/android/executor/FLExecutor.java b/fedscale/edge/mnn/app/src/main/java/com/fedscale/android/executor/FLExecutor.java index dad51b91..77a9eb06 100644 --- a/fedscale/edge/mnn/app/src/main/java/com/fedscale/android/executor/FLExecutor.java +++ b/fedscale/edge/mnn/app/src/main/java/com/fedscale/android/executor/FLExecutor.java @@ -159,7 +159,7 @@ private String deserializeResponse(ByteString responses) throws IOException { /** * Serialize the response to send to server upon assigned job completion * - * @param responses Client responses after job completion. + * @param responses TorchClient responses after job completion. * @return The serialized response object to server. 
*/ private ByteString serializeResponse(String responses) throws IOException { diff --git a/fedscale/edge/pytorch/torch_client.py b/fedscale/edge/pytorch/torch_client.py index 569fc326..66af1cd5 100644 --- a/fedscale/edge/pytorch/torch_client.py +++ b/fedscale/edge/pytorch/torch_client.py @@ -1,8 +1,8 @@ -"""A skeleton for Pytorch Client""" -from fedscale.cloud.execution.client import Client +"""A skeleton for Pytorch TorchClient""" +from fedscale.cloud.execution.torch_client import TorchClient -class Torch_Client(Client): - """A class for PyTorch version of Client, directly inherited from fedscale/cloud/execution/client.py""" +class Torch_Client(TorchClient): + """A class for PyTorch version of TorchClient, directly inherited from fedscale/cloud/execution/torch_client.py""" pass diff --git a/fedscale/tests/cloud/aggregation/test_aggregator.py b/fedscale/tests/cloud/aggregation/test_aggregator.py new file mode 100644 index 00000000..e0e9b41f --- /dev/null +++ b/fedscale/tests/cloud/aggregation/test_aggregator.py @@ -0,0 +1,49 @@ +import copy +import numpy as np +import tensorflow as tf +import torch + +from fedscale.cloud.aggregation.aggregator import Aggregator +from fedscale.cloud.internal.tensorflow_model_adapter import TensorflowModelAdapter +from fedscale.cloud.internal.torch_model_adapter import TorchModelAdapter + + +class MockAggregator(Aggregator): + def __init__(self, model_wrapper): + self.model_weights = [] + self.model_in_update = 1 + self.tasks_round = 3 + self.model_wrapper = model_wrapper + + +def multiply_weights(weights, factor): + return [weights_group * factor for weights_group in weights] + + +class TestAggregator: + def test_update_weight_aggregation_for_keras_model(self): + x = tf.keras.Input(shape=(2,)) + y = tf.keras.layers.Dense(2, activation='softmax')( + tf.keras.layers.Dense(4, activation='softmax')(x)) + model = tf.keras.Model(x, y) + model_adapter = TensorflowModelAdapter(model) + aggregator = MockAggregator(model_adapter) + weights = copy.deepcopy(model_adapter.get_weights()) + aggregator.update_weight_aggregation(multiply_weights(weights, 2)) + aggregator.model_in_update += 1 + aggregator.update_weight_aggregation(multiply_weights(weights, 2)) + aggregator.model_in_update += 1 + aggregator.update_weight_aggregation(multiply_weights(weights, 5)) + np.array_equal(aggregator.model_wrapper.get_weights(), multiply_weights(weights, 3)) + + def test_update_weight_aggregation_for_torch_model(self): + model = torch.nn.Linear(3, 2) + model_adapter = TorchModelAdapter(model) + aggregator = MockAggregator(model_adapter) + weights = copy.deepcopy(model_adapter.get_weights()) + aggregator.update_weight_aggregation(multiply_weights(weights, 2)) + aggregator.model_in_update += 1 + aggregator.update_weight_aggregation(multiply_weights(weights, 2)) + aggregator.model_in_update += 1 + aggregator.update_weight_aggregation(multiply_weights(weights, 5)) + np.array_equal(aggregator.model_wrapper.get_weights(), multiply_weights(weights, 3)) diff --git a/fedscale/utils/model_test_module.py b/fedscale/utils/model_test_module.py index 8f74f5ae..b611f99b 100755 --- a/fedscale/utils/model_test_module.py +++ b/fedscale/utils/model_test_module.py @@ -24,6 +24,8 @@ from fedscale.dataloaders.rcnn.lib.model.utils.config import cfg from fedscale.dataloaders.rcnn.lib.roi_data_layer.roidb import \ combined_roidb +elif parser.args.task == 'voice': + from fedscale.dataloaders.decoder import GreedyDecoder def cal_accuracy(targets, outputs): @@ -50,7 +52,7 @@ def cal_accuracy(targets, outputs): 
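For reference, a framework-free sketch of the behavior the aggregator test above asserts: the aggregator accumulates `tasks_round` client updates and divides once the last one arrives, so updates of 2w, 2w, and 5w must average out to 3w. The function below is illustrative, not the `Aggregator` API.

```python
import numpy as np

# Incremental averaging: sum updates, divide when the last one arrives.
def update_weight_aggregation(state, update, model_in_update, tasks_round):
    if model_in_update == 1:
        state = [u.copy() for u in update]
    else:
        state = [s + u for s, u in zip(state, update)]
    if model_in_update == tasks_round:
        state = [s / tasks_round for s in state]
    return state

w = [np.ones((2, 2))]
state = update_weight_aggregation(None, [2 * x for x in w], 1, 3)
state = update_weight_aggregation(state, [2 * x for x in w], 2, 3)
state = update_weight_aggregation(state, [5 * x for x in w], 3, 3)
assert np.array_equal(state[0], 3 * w[0])  # (2w + 2w + 5w) / 3 == 3w
```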
return temp_acc, temp_all_or_false, temp_len -def test_model(rank, model, test_data, device='cpu', criterion=nn.NLLLoss(), tokenizer=None): +def test_pytorch_model(rank, model, test_data, device='cpu', criterion=nn.NLLLoss(), tokenizer=None): test_loss = 0 correct = 0 diff --git a/fedscale/utils/models/tensorflow_model_provider.py b/fedscale/utils/models/tensorflow_model_provider.py new file mode 100644 index 00000000..2e0fea53 --- /dev/null +++ b/fedscale/utils/models/tensorflow_model_provider.py @@ -0,0 +1,63 @@ +import tensorflow as tf + + +class RowCount(tf.keras.metrics.Metric): + def __init__(self, name='row_count', **kwargs): + super(RowCount, self).__init__(name=name, **kwargs) + self.count = self.add_weight('count', initializer='zeros') + + def update_state(self, y_true, y_pred, sample_weight=None): + self.count.assign_add(tf.cast(tf.shape(y_true)[0], self.dtype)) + + def reset_state(self): + self.count.assign(0) + + def result(self): + return self.count + + +def build_resnet50(args): + model = tf.keras.applications.resnet.ResNet50( + include_top=True, + weights=None, + input_tensor=None, + input_shape=args.input_shape, + pooling=None, + classes=args.num_classes + ) + optimizer = tf.keras.optimizers.SGD(learning_rate=args.learning_rate, momentum=0.9, + nesterov=False, name='SGD') + model.compile(optimizer=optimizer, loss='categorical_crossentropy', + metrics=['accuracy', RowCount()]) + return model + + +def build_mobilenet_v3_small(args): + model = tf.keras.applications.MobileNetV3Small( + input_shape=args.input_shape, + alpha=1.0, + minimalistic=False, + input_tensor=None, + weights=None, + classes=args.num_classes, + pooling=None, + dropout_rate=0.2, + include_preprocessing=True, + ) + optimizer = tf.keras.optimizers.SGD(learning_rate=args.learning_rate, momentum=0.9, + nesterov=False, name='SGD') + model.compile(optimizer=optimizer, loss='categorical_crossentropy', + metrics=['accuracy', RowCount()]) + return model + + +_models = { + 'resnet50': build_resnet50, + 'mobilenetv3_small': build_mobilenet_v3_small +} + + +def get_tensorflow_model(name: str, args): + if name not in _models: + raise ValueError(f"{name} is not a TensorFlow-supported model in FedScale. Please add an implementation to the _models dict.") + return _models[name](args) diff --git a/fedscale/utils/models/model_provider.py b/fedscale/utils/models/torch_model_provider.py similarity index 100% rename from fedscale/utils/models/model_provider.py rename to fedscale/utils/models/torch_model_provider.py diff --git a/thirdparty/oort/oort.py b/thirdparty/oort/oort.py index 9903fbc5..7f1346dd 100644 --- a/thirdparty/oort/oort.py +++ b/thirdparty/oort/oort.py @@ -152,17 +152,17 @@ def __init__(self, args, sample_seed=233): np2.random.seed(sample_seed) - def register_client(self, clientId, feedbacks): + def register_client(self, client_id, feedbacks): # Initiate the score for arms.
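For reference, a runnable sketch of how the `row_count` metric above surfaces through `evaluate(..., return_dict=True)`, which is what lets `TensorflowClient.test` convert per-batch metric means into sample-weighted totals. The tiny dense model and the random data are illustrative.

```python
import numpy as np
import tensorflow as tf

class RowCount(tf.keras.metrics.Metric):
    """Counts samples seen, as in tensorflow_model_provider above."""
    def __init__(self, name='row_count', **kwargs):
        super().__init__(name=name, **kwargs)
        self.count = self.add_weight('count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.count.assign_add(tf.cast(tf.shape(y_true)[0], self.dtype))

    def result(self):
        return self.count

model = tf.keras.Sequential(
    [tf.keras.layers.Dense(3, activation='softmax', input_shape=(4,))])
model.compile(optimizer='sgd', loss='categorical_crossentropy',
              metrics=['accuracy', RowCount()])

x = np.random.rand(10, 4).astype('float32')
y = tf.one_hot(np.random.randint(0, 3, size=10), 3).numpy()
results = model.evaluate(x, y, batch_size=4, return_dict=True)
print(results['row_count'])  # 10.0, the basis for sample-weighted metrics
```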
[score, time_stamp, # of trials, size of client, auxi, duration] - if clientId not in self.totalArms: - self.totalArms[clientId] = {} - self.totalArms[clientId]['reward'] = feedbacks['reward'] - self.totalArms[clientId]['duration'] = feedbacks['duration'] - self.totalArms[clientId]['time_stamp'] = self.training_round - self.totalArms[clientId]['count'] = 0 - self.totalArms[clientId]['status'] = True + if client_id not in self.totalArms: + self.totalArms[client_id] = {} + self.totalArms[client_id]['reward'] = feedbacks['reward'] + self.totalArms[client_id]['duration'] = feedbacks['duration'] + self.totalArms[client_id]['time_stamp'] = self.training_round + self.totalArms[client_id]['count'] = 0 + self.totalArms[client_id]['status'] = True - self.unexplored.add(clientId) + self.unexplored.add(client_id) def calculateSumUtil(self, clientList): cnt, cntUtil = 1e-4, 0 @@ -207,20 +207,20 @@ def pacer(self): logging.info("Training selector: Pacer {}: lastExploitationUtil {}, lastExplorationUtil {}, last_util_record {}". format(self.training_round, lastExploitationUtil, lastExplorationUtil, self.last_util_record)) - def update_client_util(self, clientId, feedbacks): + def update_client_util(self, client_id, feedbacks): ''' @ feedbacks['reward']: statistical utility @ feedbacks['duration']: system utility @ feedbacks['count']: times of involved ''' - self.totalArms[clientId]['reward'] = feedbacks['reward'] - self.totalArms[clientId]['duration'] = feedbacks['duration'] - self.totalArms[clientId]['time_stamp'] = feedbacks['time_stamp'] - self.totalArms[clientId]['count'] += 1 - self.totalArms[clientId]['status'] = feedbacks['status'] + self.totalArms[client_id]['reward'] = feedbacks['reward'] + self.totalArms[client_id]['duration'] = feedbacks['duration'] + self.totalArms[client_id]['time_stamp'] = feedbacks['time_stamp'] + self.totalArms[client_id]['count'] += 1 + self.totalArms[client_id]['status'] = feedbacks['status'] - self.unexplored.discard(clientId) - self.successfulClients.add(clientId) + self.unexplored.discard(client_id) + self.successfulClients.add(client_id) def get_blacklist(self): @@ -230,9 +230,9 @@ def get_blacklist(self): sorted_client_ids = sorted(list(self.totalArms), reverse=True, key=lambda k:self.totalArms[k]['count']) - for clientId in sorted_client_ids: - if self.totalArms[clientId]['count'] > self.args.blacklist_rounds: - blacklist.append(clientId) + for client_id in sorted_client_ids: + if self.totalArms[client_id]['count'] > self.args.blacklist_rounds: + blacklist.append(client_id) else: break @@ -252,9 +252,9 @@ def select_participant(self, num_of_clients, feasible_clients=None): viable_clients = feasible_clients if feasible_clients is not None else set([x for x in self.totalArms.keys() if self.totalArms[x]['status']]) return self.getTopK(num_of_clients, self.training_round+1, viable_clients) - def update_duration(self, clientId, duration): - if clientId in self.totalArms: - self.totalArms[clientId]['duration'] = duration + def update_duration(self, client_id, duration): + if client_id in self.totalArms: + self.totalArms[client_id]['duration'] = duration def getTopK(self, numOfSamples, cur_time, feasible_clients): self.training_round = cur_time @@ -279,11 +279,11 @@ def getTopK(self, numOfSamples, cur_time, feasible_clients): moving_reward, staleness, allloss = [], [], {} - for clientId in orderedKeys: - if self.totalArms[clientId]['reward'] > 0: - creward = self.totalArms[clientId]['reward'] + for client_id in orderedKeys: + if self.totalArms[client_id]['reward'] > 0: + 
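For reference, a condensed, framework-free sketch of the per-client "arm" bookkeeping that `register_client`/`update_client_util` above maintain. The `score` helper compresses the normalized-reward-plus-staleness ranking from `getTopK`; `alpha=0.3` and the normalization constants are illustrative, not Oort's configured values.

```python
# One record per client; selection ranks clients by normalized reward
# plus a staleness bonus (simplified relative to getTopK below).
total_arms = {}

def register_client(client_id, reward, duration, cur_round):
    total_arms.setdefault(client_id, {
        'reward': reward, 'duration': duration,
        'time_stamp': cur_round, 'count': 0, 'status': True})

def score(client_id, cur_round, min_reward, range_reward, alpha=0.3):
    arm = total_arms[client_id]
    base = (arm['reward'] - min_reward) / range_reward
    staleness_bonus = alpha * (cur_round - arm['time_stamp'])
    return base + staleness_bonus

register_client('a', reward=5.0, duration=10, cur_round=1)
register_client('b', reward=9.0, duration=12, cur_round=1)
print(sorted(total_arms, key=lambda c: -score(c, 3, 5.0, 4.0)))  # ['b', 'a']
```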
creward = self.totalArms[client_id]['reward'] moving_reward.append(creward) - staleness.append(cur_time - self.totalArms[clientId]['time_stamp']) + staleness.append(cur_time - self.totalArms[client_id]['time_stamp']) max_reward, min_reward, range_reward, avg_reward, clip_value = self.get_norm(moving_reward, self.args.clip_bound) @@ -325,11 +325,11 @@ def getTopK(self, numOfSamples, cur_time, feasible_clients): cut_off_util = scores[sortedClientUtil[exploitLen]] * self.args.cut_off_util tempPickedClients = [] - for clientId in sortedClientUtil: + for client_id in sortedClientUtil: # we want at least 10 times of clients for augmentation - if scores[clientId] < cut_off_util and len(tempPickedClients) > 10.*exploitLen: + if scores[client_id] < cut_off_util and len(tempPickedClients) > 10.*exploitLen: break - tempPickedClients.append(clientId) + tempPickedClients.append(client_id) augment_factor = len(tempPickedClients) @@ -364,10 +364,10 @@ def getTopK(self, numOfSamples, cur_time, feasible_clients): pickedClients = self.exploreClients + self.exploitClients top_k_score = [] for i in range(min(3, len(pickedClients))): - clientId = pickedClients[i] - _score = (self.totalArms[clientId]['reward'] - min_reward)/range_reward - _staleness = self.alpha*((cur_time-self.totalArms[clientId]['time_stamp']) - min_staleness)/float(range_staleness) #math.sqrt(0.1*math.log(cur_time)/max(1e-4, self.totalArms[clientId]['time_stamp'])) - top_k_score.append((self.totalArms[clientId], [_score, _staleness])) + client_id = pickedClients[i] + _score = (self.totalArms[client_id]['reward'] - min_reward)/range_reward + _staleness = self.alpha*((cur_time-self.totalArms[client_id]['time_stamp']) - min_staleness)/float(range_staleness) #math.sqrt(0.1*math.log(cur_time)/max(1e-4, self.totalArms[client_id]['time_stamp'])) + top_k_score.append((self.totalArms[client_id], [_score, _staleness])) logging.info("At round {}, UCB exploited {}, augment_factor {}, exploreLen {}, un-explored {}, exploration {}, round_threshold {}, sampled score is {}" .format(cur_time, numOfExploited, augment_factor/max(1e-4, exploitLen), exploreLen, len(self.unexplored), self.exploration, self.round_threshold, top_k_score)) diff --git a/thirdparty/oort/utils/lp.py b/thirdparty/oort/utils/lp.py index 22d7680f..0b7ee083 100644 --- a/thirdparty/oort/utils/lp.py +++ b/thirdparty/oort/utils/lp.py @@ -42,13 +42,13 @@ def select_by_sorted_num(raw_datas, pref, budget): top_k_indices = sorted(feasible_clients, reverse=True, key=lambda k:sum_of_cols[k]) - for idx, clientId in enumerate(top_k_indices): + for idx, client_id in enumerate(top_k_indices): # Take this client, and reduce the preference by the capacity of each class on this client tempTakenSamples = {} for cl in listOfInterest: - takenSamples = min(preference[cl], datas[clientId][cl]) + takenSamples = min(preference[cl], datas[client_id][cl]) preference[cl] -= takenSamples if preference[cl] == 0: @@ -57,8 +57,8 @@ def select_by_sorted_num(raw_datas, pref, budget): tempTakenSamples[cl] = takenSamples - datas[clientId, :] = 0 - clientsTaken[clientId] = tempTakenSamples + datas[client_id, :] = 0 + clientsTaken[client_id] = tempTakenSamples if interestChanged: break @@ -132,7 +132,7 @@ def run_select_by_category(request_list, data_distribution, client_info, budget, #logging.info(f"Testing Selector: Augmenting the cut_off_clients to {cut_off_clients} in heuristic") augTime = time.time() - start_time - #logging.info(f"Testing Selector: Client augmentation took {augTime:.2f} sec to pick {len(select_clients)} 
clients") + #logging.info(f"Testing Selector: TorchClient augmentation took {augTime:.2f} sec to pick {len(select_clients)} clients") select_client_list = list(select_clients.keys())