Skip to content

Commit 465d9c4

Browse files
committed
remove: ddp
1 parent 403cca6 commit 465d9c4

8 files changed

+9
-534
lines changed

src/codeflare_sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,6 @@
1414
get_cluster,
1515
)
1616

17-
from .job import JobDefinition, Job, DDPJobDefinition, DDPJob, RayJobClient
17+
from .job import JobDefinition, Job, RayJobClient
1818

1919
from .utils import generate_cert

src/codeflare_sdk/job/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from .jobs import JobDefinition, Job, DDPJobDefinition, DDPJob
1+
from .jobs import JobDefinition, Job
22

33
from .ray_jobs import RayJobClient

src/codeflare_sdk/job/jobs.py

Lines changed: 0 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,10 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""
16-
The jobs sub-module contains methods needed to submit Distributed Data Parallel(DDP) jobs to Ray Clusters created by the CodeFlare SDK.
17-
"""
18-
1915
import abc
2016
from typing import TYPE_CHECKING, Optional, Dict, List
2117
from pathlib import Path
2218

23-
from torchx.components.dist import ddp
2419
from torchx.runner import get_runner, Runner
2520
from torchx.schedulers.ray_scheduler import RayScheduler
2621
from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
@@ -47,161 +42,3 @@ def status(self):
4742

4843
def logs(self):
4944
pass
50-
51-
52-
class DDPJobDefinition(JobDefinition):
53-
def __init__(
54-
self,
55-
script: Optional[str] = None,
56-
m: Optional[str] = None,
57-
script_args: Optional[List[str]] = None,
58-
name: Optional[str] = None,
59-
cpu: Optional[int] = None,
60-
gpu: Optional[int] = None,
61-
memMB: Optional[int] = None,
62-
h: Optional[str] = None,
63-
j: Optional[str] = None,
64-
env: Optional[Dict[str, str]] = None,
65-
max_retries: int = 0,
66-
mounts: Optional[List[str]] = None,
67-
rdzv_port: int = 29500,
68-
rdzv_backend: str = None,
69-
scheduler_args: Optional[Dict[str, str]] = None,
70-
image: Optional[str] = None,
71-
workspace: Optional[str] = f"file://{Path.cwd()}",
72-
):
73-
if bool(script) == bool(m): # logical XOR
74-
raise ValueError(
75-
"Exactly one of the following arguments must be defined: [script, m]."
76-
)
77-
self.script = script
78-
self.m = m
79-
self.script_args: List[str] = script_args if script_args is not None else []
80-
self.name = name
81-
self.cpu = cpu
82-
self.gpu = gpu
83-
self.memMB = memMB
84-
self.h = h
85-
self.j = j
86-
self.env: Dict[str, str] = env if env is not None else dict()
87-
self.max_retries = max_retries
88-
self.mounts: List[str] = mounts if mounts is not None else []
89-
self.rdzv_port = rdzv_port
90-
self.rdzv_backend = rdzv_backend
91-
self.scheduler_args: Dict[str, str] = (
92-
scheduler_args if scheduler_args is not None else dict()
93-
)
94-
self.image = image
95-
self.workspace = workspace
96-
97-
def _dry_run(self, cluster: "Cluster"):
98-
j = f"{cluster.config.num_workers}x{max(cluster.config.num_gpus, 1)}" # # of proc. = # of gpus
99-
runner = get_runner(ray_client=cluster.job_client)
100-
runner._scheduler_instances["ray"] = RayScheduler(
101-
session_name=runner._name, ray_client=cluster.job_client
102-
)
103-
return (
104-
runner.dryrun(
105-
app=ddp(
106-
*self.script_args,
107-
script=self.script,
108-
m=self.m,
109-
name=self.name,
110-
h=self.h,
111-
cpu=self.cpu if self.cpu is not None else cluster.config.max_cpus,
112-
gpu=self.gpu if self.gpu is not None else cluster.config.num_gpus,
113-
memMB=self.memMB
114-
if self.memMB is not None
115-
else cluster.config.max_memory * 1024,
116-
j=self.j if self.j is not None else j,
117-
env=self.env,
118-
max_retries=self.max_retries,
119-
rdzv_port=self.rdzv_port,
120-
rdzv_backend=self.rdzv_backend
121-
if self.rdzv_backend is not None
122-
else "static",
123-
mounts=self.mounts,
124-
),
125-
scheduler=cluster.torchx_scheduler,
126-
cfg=cluster.torchx_config(**self.scheduler_args),
127-
workspace=self.workspace,
128-
),
129-
runner,
130-
)
131-
132-
def _missing_spec(self, spec: str):
133-
raise ValueError(f"Job definition missing arg: {spec}")
134-
135-
def _dry_run_no_cluster(self):
136-
if self.scheduler_args is not None:
137-
if self.scheduler_args.get("namespace") is None:
138-
self.scheduler_args["namespace"] = get_current_namespace()
139-
runner = get_runner()
140-
return (
141-
runner.dryrun(
142-
app=ddp(
143-
*self.script_args,
144-
script=self.script,
145-
m=self.m,
146-
name=self.name
147-
if self.name is not None
148-
else self._missing_spec("name"),
149-
h=self.h,
150-
cpu=self.cpu
151-
if self.cpu is not None
152-
else self._missing_spec("cpu (# cpus per worker)"),
153-
gpu=self.gpu
154-
if self.gpu is not None
155-
else self._missing_spec("gpu (# gpus per worker)"),
156-
memMB=self.memMB
157-
if self.memMB is not None
158-
else self._missing_spec("memMB (memory in MB)"),
159-
j=self.j
160-
if self.j is not None
161-
else self._missing_spec(
162-
"j (`workers`x`procs`)"
163-
), # # of proc. = # of gpus,
164-
env=self.env, # should this still exist?
165-
max_retries=self.max_retries,
166-
rdzv_port=self.rdzv_port, # should this still exist?
167-
rdzv_backend=self.rdzv_backend
168-
if self.rdzv_backend is not None
169-
else "c10d",
170-
mounts=self.mounts,
171-
image=self.image
172-
if self.image is not None
173-
else self._missing_spec("image"),
174-
),
175-
scheduler="kubernetes_mcad",
176-
cfg=self.scheduler_args,
177-
workspace="",
178-
),
179-
runner,
180-
)
181-
182-
def submit(self, cluster: "Cluster" = None) -> "Job":
183-
return DDPJob(self, cluster)
184-
185-
186-
class DDPJob(Job):
187-
def __init__(self, job_definition: "DDPJobDefinition", cluster: "Cluster" = None):
188-
self.job_definition = job_definition
189-
self.cluster = cluster
190-
if self.cluster:
191-
definition, runner = job_definition._dry_run(cluster)
192-
self._app_handle = runner.schedule(definition)
193-
self._runner = runner
194-
else:
195-
definition, runner = job_definition._dry_run_no_cluster()
196-
self._app_handle = runner.schedule(definition)
197-
self._runner = runner
198-
all_jobs.append(self)
199-
200-
def status(self) -> str:
201-
return self._runner.status(self._app_handle)
202-
203-
def logs(self) -> str:
204-
return "".join(self._runner.log_lines(self._app_handle, None))
205-
206-
def cancel(self):
207-
self._runner.cancel(self._app_handle)

tests/e2e/mnist_raycluster_sdk_oauth_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from torchx.specs.api import AppState, is_terminal
66

77
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication
8-
from codeflare_sdk.job.jobs import DDPJobDefinition
8+
from codeflare_sdk.job.jobs import JobDefinition
99

1010
import pytest
1111

@@ -97,7 +97,7 @@ def assert_jobsubmit_withoutLogin(self, cluster):
9797

9898
def assert_jobsubmit_withlogin(self, cluster):
9999
self.assert_appwrapper_exists()
100-
jobdef = DDPJobDefinition(
100+
jobdef = JobDefinition(
101101
name="mnist",
102102
script="./tests/e2e/mnist.py",
103103
scheduler_args={"requirements": "./tests/e2e/mnist_pip_requirements.txt"},

tests/e2e/mnist_raycluster_sdk_test.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from torchx.specs.api import AppState, is_terminal
1111

1212
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
13-
from codeflare_sdk.job.jobs import DDPJobDefinition
13+
from codeflare_sdk.job.jobs import JobDefinition
1414

1515
import pytest
1616

@@ -69,6 +69,7 @@ def run_mnist_raycluster_sdk(self):
6969
num_gpus=0,
7070
instascale=False,
7171
image=ray_image,
72+
ingress_domain="apps.cluster.awsroute.org",
7273
ingress_options=ingress_options,
7374
write_to_file=True,
7475
)
@@ -86,7 +87,7 @@ def run_mnist_raycluster_sdk(self):
8687

8788
cluster.details()
8889

89-
jobdef = DDPJobDefinition(
90+
jobdef = JobDefinition(
9091
name="mnist",
9192
script="./tests/e2e/mnist.py",
9293
scheduler_args={"requirements": "./tests/e2e/mnist_pip_requirements.txt"},

tests/e2e/mnist_rayjob.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
from torchx.specs.api import AppState, is_terminal
66

77
from codeflare_sdk.cluster.cluster import get_cluster
8-
from codeflare_sdk.job.jobs import DDPJobDefinition
8+
from codeflare_sdk.job.jobs import JobDefinition
99

1010
namespace = sys.argv[1]
1111

1212
cluster = get_cluster("mnist", namespace)
1313

1414
cluster.details()
1515

16-
jobdef = DDPJobDefinition(
16+
jobdef = JobDefinition(
1717
name="mnist",
1818
script="mnist.py",
1919
scheduler_args={"requirements": "requirements.txt"},

0 commit comments

Comments (0)