job.json
{
"jobid": "bf3d5456-fba9-42b1-9b55-119366b409ee",
"cmdline": {
"appPart": "python3 tmp-10491-mEkonq1toz76 -v -b ${S3_BUCKET} -m ${S3_OBJECTMODEL} -g ${S3_OBJECTGLUEDATA} -t WNLI -M -s 40 41 42 43",
"systemPart": "ray job submit --runtime-env=/var/folders/2k/7mgd1tq55gdbghf0xkl2t_l80000gp/T/tmp-10491-BamAl1LRY98R --job-id ${JOB_ID} --no-wait"
},
"runtimeEnv": {
"env_vars": {
"JOB_ID": "bf3d5456-fba9-42b1-9b55-119366b409ee",
"S3_PROVIDER": "aws",
"S3_ENDPOINT": "https://s3.amazonaws.com",
"S3_ACCESS_KEY_ID": "********",
"AWS_ACCESS_KEY_ID": "********",
"S3_SECRET_ACCESS_KEY": "********",
"AWS_SECRET_ACCESS_KEY": "********",
"MC_CONFIG_DIR": "/var/folders/2k/7mgd1tq55gdbghf0xkl2t_l80000gp/T/tmp.8btLMMUx",
"S3_BUCKETRAYLOGS": "browsey",
"S3_FILEPATHRAYLOGS": "RAYLOGS",
"LOGDIR_STAGE": "/var/folders/2k/7mgd1tq55gdbghf0xkl2t_l80000gp/T/logdir-stage.bj5hQCjp",
"STREAMCONSUMER_LOGS": "/var/folders/2k/7mgd1tq55gdbghf0xkl2t_l80000gp/T/logdir-stage.bj5hQCjp/logs/",
"STREAMCONSUMER_EVENTS": "/var/folders/2k/7mgd1tq55gdbghf0xkl2t_l80000gp/T/logdir-stage.bj5hQCjp/events/",
"STREAMCONSUMER_RESOURCES": "/var/folders/2k/7mgd1tq55gdbghf0xkl2t_l80000gp/T/logdir-stage.bj5hQCjp/resources/",
"S3_LOGDIR": "browsey/codeflare/bf3d5456-fba9-42b1-9b55-119366b409ee",
"LOGDIR_URI": "s3://browsey/codeflare/bf3d5456-fba9-42b1-9b55-119366b409ee",
"LOGDIR_MC": "s3/browsey/codeflare/bf3d5456-fba9-42b1-9b55-119366b409ee",
"PATH": "/Users/nickm/.local/bin:/usr/local/opt/[email protected]/Frameworks/Python.framework/Versions/3.9/bin:/Users/nickm/.krew/bin:/Library/Frameworks/Python.framework/Versions/2.7/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/Applications/VMware Fusion.app/Contents/Public:/Library/TeX/texbin:/usr/local/go/bin:/Library/Apple/usr/bin:/Users/nickm/bin",
"KUBE_CONTEXT": "default/api-codeflare-train-v11-codeflare-openshift-com:6443/kube:admin",
"KUBE_NS": "nvidia-gpu-operator",
"KUBE_POD_LABEL_SELECTOR": "ray-user-node-type=rayWorkerType",
"KUBE_POD_RAY_HEAD_LABEL_SELECTOR": "ray-user-node-type=rayHeadType",
"KUBE_PODFULL_LABEL_SELECTOR": "ray-node-type",
"NUM_CPUS": "1",
"NUM_GPUS": "1",
"MIN_WORKERS": "1",
"MAX_WORKERS": "1",
"WORKER_MEMORY": "32Gi",
"HEAD_MEMORY": "32Gi",
"RAY_OPERATOR_IMAGE": "rayproject/ray:1.13.0-py37",
"RAY_IMAGE": "rayproject/ray-ml:1.13.0-py37-gpu",
"HELM_CLONE_TEMPDIR": "/var/folders/2k/7mgd1tq55gdbghf0xkl2t_l80000gp/T/tmp.LPMYQyK4",
"RAY_KUBE_CLUSTER_NAME": "mycluster",
"RAY_KUBE_PORT": "8508",
"KUI_RAY_ADDRESS": "http://127.0.0.1:8508",
"S3_BUCKET": "browsey",
"S3_FILEPATH": "browsey",
"S3_OBJECTMODEL": "roberta-base",
"S3_FILEPATHMODEL": "browsey/roberta-base",
"S3_OBJECTGLUEDATA": "glue_data",
"S3_FILEPATHGLUEDATA": "browsey/glue_data",
"WANDB_CONFIG_DIR": "/tmp",
"WANDB_DISABLED": "true"
},
"working_dir": "/var/folders/2k/7mgd1tq55gdbghf0xkl2t_l80000gp/T/tmp-10491-pvWSI0jqUa2s",
"pip": ["boto3", "ray[default]", "ray_lightning", "pytorch_lightning", "torchvision", "transformers==3.0.2"]
},
"language": "python",
"source": "# Copyright 2021 IBM Corp.\n\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n\n# http://www.apache.org/licenses/LICENSE-2.0\n\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\nimport os\nimport sys\nimport time\nimport datetime\nimport tempfile\nimport boto3\nimport tarfile\nimport subprocess\nimport ray\nimport json\nimport argparse\nfrom glob import glob\nimport logging\nimport socket\nimport re\n\n\n# ------------ validate S3 -----------\n# Hard to diagnose without these checks\n\ndef Validate_S3(logger,bucket,model,gluedata):\n param = os.environ.get('S3_ACCESS_KEY_ID')\n if param == None:\n logger.warning(\"S3_ACCESS_KEY_ID is missing from environment\")\n return False\n param = os.environ.get('S3_SECRET_ACCESS_KEY')\n if param == None:\n logger.warning(\"S3_SECRET_ACCESS_KEY is missing from environment\")\n return False\n\n\n client = boto3.client(\n 's3',\n aws_access_key_id = os.environ.get('S3_ACCESS_KEY_ID'),\n aws_secret_access_key = os.environ.get('S3_SECRET_ACCESS_KEY'),\n endpoint_url = os.environ.get('S3_ENDPOINT')\n )\n\n try:\n check = client.head_bucket(Bucket=bucket)\n except Exception as e:\n logger.warning(f\"bucket={bucket} not found\")\n return False\n\n try:\n check = client.head_object(Bucket=bucket, Key=model)\n except Exception as e:\n logger.warning(f\"key={model} not found in bucket={bucket}\")\n return False\n\n try:\n check = client.head_object(Bucket=bucket, Key=gluedata)\n except Exception as e:\n logger.warning(f\"key={gluedata} not found in bucket={bucket}\")\n return False\n\n logger.info(f\"S3 data looks good in bucket={bucket}\")\n return True\n\n\n# ------------ detached ray actor: DataRefs -----------\n# pulls data from S3 and caches in Plasma for local scaleout\n# returns objref for data previously cached\n# S3 credentials must be defined in the env\n\[email protected]\nclass DataRefs:\n def __init__(self,bucket):\n self.state = {}\n self.refs = {}\n self.bucket = bucket\n self.client = boto3.client(\n 's3',\n aws_access_key_id = os.environ.get('S3_ACCESS_KEY_ID'),\n aws_secret_access_key = os.environ.get('S3_SECRET_ACCESS_KEY'),\n endpoint_url = os.environ.get('S3_ENDPOINT')\n )\n\n # check if data for key is already cached\n # if not, try to get data from s3 and put it in plasma\n def Get_dataref(self,key):\n if key in self.state:\n if self.state[key] == 'Cached':\n return self.refs[key]\n print(f\" Data: try remote get {key} from s3\")\n try:\n dataobject = self.client.get_object(Bucket=self.bucket, Key=key)\n data = dataobject['Body'].read()\n print(f\" Data: done remote get {key} from s3\")\n print(f\" Data: try cache put {key} into plasma\")\n self.refs[key] = ray.put(data)\n self.state[key] = 'Cached'\n print(f\" Data: done cache put {key} data into plasma\")\n return self.refs[key]\n except Exception as e:\n print(\"Unable to retrieve/put object contents: {0}\\n\\n\".format(e))\n self.state[key] = 'Failed'\n return None\n\n def Get_state(self):\n return self.state\n\n\n# ------------ Fetch dataref into plasma -----------\n# Calls actor to get objref of S3 data cached in Plasma\ndef Fetch_data_to_cache(logger,dataRefs,key):\n 
try:\n st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')\n logger.info(f\"{st} Get {key} data reference from data actor\")\n ref = ray.get(dataRefs.Get_dataref.remote(key))\n if ref == None:\n logger.warning(f\"Could not get {key} data reference from data actor\")\n return False\n return True\n\n except Exception as e:\n logger.warning(f\"Unable to retrieve {key} dataset: {0}\".format(e))\n return False\n\n# ------------ Fetch data to local dir -----------\n# pulls data from Plasma and unpack in local directory\ndef Fetch_data_to_local_dir(logger,dataRefs,key):\n if not Fetch_data_to_cache(logger,dataRefs,key):\n return False\n try:\n time_start = time.time()\n st = datetime.datetime.fromtimestamp(time_start).strftime('%Y-%m-%d %H:%M:%S')\n logger.info(f\"{st} Data: try cache get {key}\")\n ref = ray.get(dataRefs.Get_dataref.remote(key))\n if ref == None:\n logger.warning(f\"Could not get {key} data reference from data actor\")\n return False\n\n dataset = ray.get(ref)\n time_done = time.time()\n st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')\n logger.info(f\"{st} Data: done cache get {key} length={len(dataset)} took {time_done-time_start:.2f}s\")\n tmpdata = f\"/tmp/{key}.tgz\"\n f = open(tmpdata, \"wb\")\n f.write(dataset)\n f.close\n\n logger.info(f\"{st} Data: try unpack {key} tarfile\")\n time_start = time.time()\n file = tarfile.open(tmpdata)\n file.extractall('./')\n file.close()\n time_done = time.time()\n st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')\n logger.info(f\"{st} Data: done unpack {key} tarfile took {time_done-time_start:.2f}s\")\n return True\n\n except Exception as e:\n logger.warning(f\"Unable to retrieve/unpack {key} dataset: {0}\".format(e))\n return False\n\n\n# -------------------- Process_task -----------------\n# process_task first checks if the glue datasets and the model to test are present\n# if not, it requests the data to be fetched from plasma and unpacked locally\n# Two log streams are created: a debug level stream to stdout and an info level to file\n# The results are packed into a python hashmap and returned\n\[email protected](num_gpus=1)\ndef Process_task(dataRefs,bucket,model,gluedata,task,seed,LR,savemodel):\n # clean and recreate result directory\n resultdir = ResultDir(model,task,seed,LR)\n subprocess.run(['rm', '-rf', resultdir])\n subprocess.run(['mkdir', '-p', resultdir])\n\n # create console handler at DEBUG and logfile hander at INFO\n logger = logging.getLogger(__name__)\n logger.setLevel(logging.DEBUG)\n consoleHandler = logging.StreamHandler(sys.stdout)\n consoleHandler.setLevel(logging.DEBUG)\n logger.addHandler(consoleHandler)\n fileHandler = logging.FileHandler(f\"{resultdir}/log.log\")\n fileHandler.setLevel(logging.INFO)\n logger.addHandler(fileHandler)\n\n # print node and GPU IDs\n node_id = str(ray.get_runtime_context().node_id)\n GPU_id = str(ray.get_gpu_ids())\n logger.info(f\"Node ID: {node_id}\")\n logger.info(f\"GPU IDs: {GPU_id}\")\n\n # Reuse local glue data directory or try to create it\n if not os.path.isdir('./'+gluedata):\n if not Fetch_data_to_local_dir(logger, dataRefs, gluedata):\n return ['ERROR',f\"Fetch_data_to_local_dir for {gluedata} failed\"]\n else:\n logger.info(\"Reusing previous existing glue-dataset\")\n\n # Reuse local model directory or try to create it\n if not os.path.isdir('./'+model):\n if not Fetch_data_to_local_dir(logger, dataRefs, model):\n return ['ERROR',f\"Fetch_data_to_local_dir for {model} 
failed\"]\n else:\n logger.info(f\"Reusing {model} directory\")\n\n logger.info(f\"Processing task {task} seed {seed} with model {model}\")\n\n # Pull run_glue.py into local pod\n # This code version must match the transformer version being used\n if not os.path.isfile('./run_glue.py'):\n subprocess.run(['wget', 'https://raw.githubusercontent.com/huggingface/transformers/b0892fa0e8df02d683e05e625b3903209bff362d/examples/text-classification/run_glue.py'])\n\n # change location of transformer cache to a writable directory\n os.environ['TRANSFORMERS_CACHE'] = '/tmp/cache/'\n\n runargs = [\"python\",\"./run_glue.py\"]\n runargs.extend([\"--model_name_or_path\",model])\n runargs.extend([\"--task_name\",task])\n runargs.extend([\"--do_train\",\"--do_eval\"])\n runargs.extend([\"--data_dir\",f\"{gluedata}/{task}\"])\n runargs.extend([\"--max_seq_length\",\"128\"])\n runargs.extend([\"--per_device_train_batch_size\",\"32\"])\n runargs.extend([\"--learning_rate\",LR])\n runargs.extend([\"--num_train_epochs\",\"6.0\"])\n runargs.extend([\"--save_steps\",\"50000\"])\n runargs.extend([\"--save_total_limit\",\"0\"])\n runargs.extend([\"--seed\",seed])\n runargs.extend([\"--overwrite_output_dir\",\"--output_dir\",resultdir])\n\n # use this regex to exclude debug content from logfile\n p = re.compile(r\".*(Epoch|Iteration|Evaluation): .*(s/it|it/s)].*\")\n\n # finally, do the work\n time_start = time.time()\n proc = subprocess.Popen(runargs,stdout=subprocess.PIPE, stderr=subprocess.STDOUT,universal_newlines=True)\n for line in proc.stdout:\n if re.match(p,line) is None:\n if not line == \"\\n\":\n logger.info(line.rstrip())\n else:\n logger.debug(line.rstrip())\n proc.wait()\n time_proc = time.time()-time_start\n\n # flush logfile\n logger.removeHandler(consoleHandler)\n logger.removeHandler(fileHandler)\n del logger, consoleHandler, fileHandler\n\n results = PackResults(model,task,seed,LR,time_proc,savemodel)\n\n # clean up local result directory\n subprocess.run(['rm', '-rf', resultdir])\n\n return results\n\n\n# ------------------ Return remote result directory name\ndef ResultDir(model,task,seed,LR):\n taskl = task.lower()\n return f\"result/{model}/{task}/lr-{LR}/{taskl}_seed-{seed}_lr-{LR}_TBATCH-32\"\n\n\n# ------------------ PackResults\n# Puts selected info, files, and optionally a reference to the generated subtask model, into a python hashmap\n\ndef PackResults(model,task,seed,LR,time,savemodel):\n dir = ResultDir(model,task,seed,LR)\n files = glob(os.path.join(dir, f\"eval_results_*.txt\"))\n files.append(os.path.join(dir, \"log.log\"))\n taskres = {}\n taskres[\"model\"] = model\n taskres[\"LR\"] = LR\n taskres[\"task\"] = task\n taskres[\"seed\"] = seed\n taskres[\"time\"] = time\n taskres[\"hostname\"] = socket.gethostname()\n for f in files:\n with open(f, \"rb\") as afile:\n data = afile.read()\n taskres[os.path.basename(f)] = data\n\n # put the model in plasma and reference in hashmap\n if savemodel:\n f = os.path.join(dir, \"pytorch_model.bin\")\n if os.path.isfile(f):\n with open(f, \"rb\") as afile:\n data = afile.read()\n taskres[\"pytorch_model.bin\"] = ray.put(data)\n\n return taskres\n\n\n# ------------------ Return local result directory name\ndef SummaryDir(model,LR,task,seed):\n if seed == None:\n return f\"/tmp/summary/{model}_lr-{LR}/{task}\"\n else:\n return f\"/tmp/summary/{model}_lr-{LR}/{task}/seed-{seed}\"\n\n\n# ------------------ Best_model ----------------\n# checks if this is best model yet for task. 
If so delete last model and return eval score\ndef Best_model(model,LR,task,seed):\n # per task metric for evaluating best model (from Masayasu Muraoka)\n eval_metric = {\n \"cola\": \"mcc\", \"mnli\": \"mnli/acc\", \"sst-2\": \"acc\", \"sts-b\": \"corr\",\n \"qqp\": \"acc_and_f1\", \"qnli\": \"acc\", \"rte\": \"acc\", \"wnli\": \"acc\",\n \"mrpc\": \"f1\"\n }\n subtasks_dir = SummaryDir(model,LR,task,None)\n new_subtask_dir = SummaryDir(model,LR,task,seed)\n metric = eval_metric[task.lower()]\n grppr = \"eval_\"+metric+\" = \"\n best_score = 0\n bin_dirs = []\n # scan all subtasks for this task, get new score and best previous score\n for f in os.listdir(subtasks_dir):\n if os.path.exists(f\"{subtasks_dir}/{f}/pytorch_model.bin\"):\n bin_dirs.append(f\"{subtasks_dir}/{f}/pytorch_model.bin\")\n\n with open(f\"{subtasks_dir}/{f}/eval_results_{task.lower()}.txt\") as fp:\n for line in fp:\n if line.startswith(grppr):\n score = float(line.split(grppr)[1])\n if f\"{subtasks_dir}/{f}\" == new_subtask_dir:\n new_score = score\n else:\n if score > best_score:\n best_score = score\n\n if new_score <= best_score:\n return False, 0\n # remove previous best model\n for f in bin_dirs:\n os.remove(f)\n return True, new_score\n\n\n# ------------------ Save models = true, Check for previous saved models ----------------\n# checks if there are any specified tasks having previous subtasks with no models saved\n# if this is the case no new score may be the best score and no new model would be saved\ndef Check_for_previous_models(model,LR,tasks):\n for task in tasks:\n subtasks_dir = SummaryDir(model,LR,task,None)\n if not os.path.exists(subtasks_dir):\n continue\n # scan all subtasks for this task and see if there are completed subtasks but no models saved\n if any (os.path.exists(f\"{subtasks_dir}/{f}/eval_results_{task.lower()}.txt\") for f in os.listdir(subtasks_dir)):\n if not any (os.path.exists(f\"{subtasks_dir}/{f}/pytorch_model.bin\") for f in os.listdir(subtasks_dir)):\n logger.warning(f\"WARNING: completed subtasks for {task} exist but no previous models saved. May not save best/any model.\")\n return\n\n\n# -------------------- MAIN ------------------\nparser = argparse.ArgumentParser(description='Driver for run_glue')\nparser.add_argument('-m',\"--model\", required=True,\n help=\"S3 Key and local directory name of base model, e.g. roberta-base\")\nparser.add_argument('-g',\"--gluedata\", default=\"glue_data\",\n help=\"S3 key and local directory name of glue dataset (Default=glue_data)\")\nparser.add_argument('-b',\"--bucket\", required=True, help=\"S3 bucket name\")\nparser.add_argument('-t','--tasks', nargs='+',\n # required MRPC data missing from public download\n # help=\"tasks to run, e.g. -t WNLI CoLA (Default=WNLI STS-B CoLA RTE MRPC SST-2 MNLI QNLI QQP)\",\n # default=['WNLI','STS-B','CoLA','RTE','MRPC','SST-2','MNLI','QNLI','QQP'], action='store')\n help=\"tasks to run, e.g. -t WNLI CoLA (Default=WNLI STS-B CoLA RTE SST-2 MNLI QNLI QQP)\",\n default=['WNLI','STS-B','CoLA','RTE','SST-2','MNLI','QNLI','QQP'], action='store')\nparser.add_argument('-s','--seeds', nargs='+', default=list(range(38,48)), action='store',\n help=\"seeds to run, e.g. 
-s 38 39 (Default=38 39 40 41 42 43 44 45 46 47)\")\nparser.add_argument('-l',\"--learning_rate\", default=\"2e-5\",help=\"Learning Rate (Default=2e-5)\")\nparser.add_argument('-M',\"--savemodel\", action='store_true',help=\"Save best scoring model for each task (Default=False)\")\nparser.add_argument('-r',\"--ray\", default=\"glue-cluster-ray-head:10001\",help=\"ray_service:port\")\nparser.add_argument('-v',\"--verbose\", action='store_true',help=\"show remote consoles (Default=False)\")\nargs = parser.parse_args()\n\nmodel=args.model\ngluedata=args.gluedata\nbucket=args.bucket\ntasks=args.tasks\nseeds=[str(x) for x in args.seeds]\nLR=args.learning_rate\nsavemodel=args.savemodel\nray_service=args.ray\nverbose=args.verbose\n\n# create logger for driver stdout and logfile\nlogger = logging.getLogger(__name__)\nlogger.setLevel(logging.INFO)\nconsoleHandler = logging.StreamHandler(sys.stdout)\nconsoleHandler.setLevel(logging.INFO)\nlogger.addHandler(consoleHandler)\nfileHandler = logging.FileHandler(\"/tmp/gluejob.console\")\nfileHandler.setLevel(logging.INFO)\nlogger.addHandler(fileHandler)\n\nst = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')\nlogger.info(f\"\\n{st} Starting Glue benchmark ---------------\")\nlogger.info(f\"model: {model}\")\nlogger.info(f\"gluedata: {gluedata}\")\nlogger.info(f\"bucket: {bucket}\")\nlogger.info(f\"tasks: {' '.join(tasks)}\")\nlogger.info(f\"seeds: {' '.join(seeds)}\")\nlogger.info(f\"learning_rate: {float(LR)}\")\nlogger.info(f\"savemodel: {savemodel}\")\nlogger.info(f\"ray_service: {ray_service}\")\n\n# if savemodel=True, check if there are saved subtasks with no saved model and warn user\nif savemodel == True:\n Check_for_previous_models(model,LR,tasks)\n\n# connect to ray cluster\n#ray.init(\"ray://\"+ray_service,log_to_driver=verbose,namespace=\"nvidia-gpu-operator\")\nray.init(address=\"auto\", log_to_driver=verbose, namespace=\"nvidia-gpu-operator\")\n\n# check if S3 credentials are set and objects look accessible\nif not Validate_S3(logger,bucket,model,gluedata):\n logger.error(f\"Fatal error verifying S3 access to specified objects\")\n sys.exit()\n\n# create data actor if not yet exists\n# namespace is required to find a previously persisted actor instance\ndata_actor_name = 'DataRefsActor'\nnames = ray.util.list_named_actors()\nif any(x == data_actor_name for x in names):\n dataRefs = ray.get_actor(data_actor_name)\n state = ray.get(dataRefs.Get_state.remote())\n logger.info(f\" Found actor={data_actor_name} with state {state}\")\nelse:\n logger.info(f\" actor={data_actor_name} not found ... 
deploy it\")\n dataRefs = DataRefs.options(name=data_actor_name,lifetime=\"detached\").remote(bucket)\n\n# make sure required datasets are cached in actor\nactorstate = ray.get(dataRefs.Get_state.remote())\ngluecached = modelcached = True\nif not actorstate.get(gluedata) == 'Cached':\n gluecached = Fetch_data_to_cache(logger,dataRefs,gluedata)\nif not actorstate.get(model) == 'Cached':\n modelcached = Fetch_data_to_cache(logger,dataRefs,model)\nif not gluecached or not modelcached:\n logger.error(f\"Fatal error caching dataset from S3\")\n sys.exit()\n\n# submit all subtasks at the same time\ntasks = [Process_task.remote(dataRefs,bucket,model,gluedata,task,str(seed),LR,savemodel) for task in tasks for seed in seeds]\nst = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')\nlogger.info(f\"{st} Submitted {len(tasks)} subtasks\")\n\n# wait for all to be done, one at a time\n# TODO handle remote processing exceptions\nincomplete = tasks\ncomplete = []\nwhile len(complete) < len(tasks):\n onedone, incomplete = ray.wait(incomplete, num_returns=1, timeout=None)\n results = ray.get(onedone)\n complete.append(onedone)\n taskres = results[0]\n \n st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')\n if \"ERROR\" in taskres:\n logger.error(f\"{st} Fatal error: {taskres['ERROR']}\")\n sys.exit()\n\n # check for valid result\n if any(x.startswith('eval_results') for x in taskres):\n logger.info(f\"{st} {taskres['model']} lr-{taskres['LR']} {taskres['task']} seed-{taskres['seed']}\"+\n f\" took {taskres['time']:.1f}s on {taskres['hostname']} ... {len(complete)} of {len(tasks)} subtasks done\")\n else:\n logger.error(f\"{st} {taskres['model']} lr-{taskres['LR']} {taskres['task']} seed-{taskres['seed']}\"+\n f\" returned ERROR ... {len(complete)} of {len(tasks)} subtasks done\")\n\n # copy results to a known place for access from outside pod; Remove any leftover files\n outfolder = SummaryDir(taskres['model'],taskres['LR'],taskres['task'],taskres['seed'])\n subprocess.run(['mkdir', '-p', outfolder])\n subprocess.run(['rm', '-rf', outfolder+\"/*\"])\n\n score = None\n for key in taskres.keys():\n if key == 'model' or key == 'LR' or key == 'task' or key == 'seed' or key == 'time' or key == 'hostname':\n continue\n if not key == 'pytorch_model.bin':\n f = open(outfolder+'/'+key, \"wb\")\n f.write(taskres[key])\n f.close\n else:\n # check if this subtask model should be saved\n save,score = Best_model(taskres['model'],taskres['LR'],taskres['task'],taskres['seed'])\n if save:\n # get model from plasma and store locally\n time_start = time.time()\n plasobj = taskres[key]\n modelbin = ray.get(plasobj)\n del (plasobj)\n time_pull = time.time()-time_start\n st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')\n logger.info(f\"{st} eval={score}, model pull took {time_pull:.1f}s for length={len(modelbin)}\")\n f = open(outfolder+'/'+key, \"wb\")\n f.write(modelbin)\n f.close\n\n# Example run command:\n#./glue_benchmark.py -b codeflare-ai -m roberta-base -t WNLI -M -s 40 41 42 43"
}