Skip to content

Commit f836653

Browse files
Added mnist_fashion as default job demo
1 parent a9b314e commit f836653

File tree

4 files changed

+93
-6
lines changed

4 files changed

+93
-6
lines changed

demo-notebooks/guided-demos/1_cluster_job_client.ipynb

+3-2
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@
5050
" max_cpus=1,\n",
5151
" min_memory=4,\n",
5252
" max_memory=4,\n",
53-
" num_gpus=0,\n",
53+
" num_gpus=1,\n",
54+
" head_gpus=1,\n",
5455
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
5556
" write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
5657
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
@@ -114,7 +115,7 @@
114115
"source": [
115116
"# Submit an example mnist job using the Job Submission Client\n",
116117
"submission_id = client.submit_job(\n",
117-
" entrypoint=\"python mnist.py\",\n",
118+
" entrypoint=\"python mnist_fashion.py\",\n",
118119
" runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
119120
")\n",
120121
"print(submission_id)"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import torch
2+
import torch.nn as nn
3+
import ray
4+
from torch.utils.data import DataLoader
5+
from torchvision import datasets
6+
from torchvision.transforms import ToTensor
7+
from ray.train.torch import TorchTrainer
8+
from ray.train import ScalingConfig
9+
10+
11+
def get_dataset():
    """Return the FashionMNIST training split with images converted to tensors.

    The data is stored under ``/tmp/data``; ``download=True`` asks
    torchvision to fetch it there.
    """
    dataset_options = {
        "root": "/tmp/data",
        "train": True,
        "download": True,
        "transform": ToTensor(),
    }
    return datasets.FashionMNIST(**dataset_options)
18+
19+
20+
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flattened 28x28 input -> 512 -> 512 -> 10 logits."""

    def __init__(self):
        super().__init__()
        in_features, hidden, num_classes = 28 * 28, 512, 10

        self.flatten = nn.Flatten()
        # Layers are created in the same order as they appear in the stack so
        # that parameter initialisation and state_dict keys stay unchanged.
        layers = [
            nn.Linear(in_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, num_classes),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, inputs):
        """Flatten each sample and return the raw (un-normalised) class logits."""
        return self.linear_relu_stack(self.flatten(inputs))
36+
37+
38+
# NOTE: get_dataset() is defined once, above NeuralNetwork. It is deliberately
# not redefined here: a second, identical definition would only shadow the
# first and invite the two copies to drift apart.
45+
46+
47+
def train_func_distributed():
    """Per-worker training loop executed by Ray Train.

    Trains ``NeuralNetwork`` on the dataset returned by ``get_dataset`` for
    three epochs with SGD and cross-entropy loss, printing the loss of the
    last batch at the end of every epoch.
    """
    num_epochs = 3
    batch_size = 64

    # Data: wrap the plain DataLoader so Ray can adapt it for this worker.
    loader = ray.train.torch.prepare_data_loader(
        DataLoader(get_dataset(), batch_size=batch_size, shuffle=True)
    )

    # Model: let Ray wrap/move the module as required for this worker.
    net = ray.train.torch.prepare_model(NeuralNetwork())

    loss_fn = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(net.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        # With more than one worker the sampler has to be told the epoch
        # (set_epoch) at the start of each pass over the data.
        multi_worker = ray.train.get_context().get_world_size() > 1
        if multi_worker:
            loader.sampler.set_epoch(epoch)

        for inputs, labels in loader:
            sgd.zero_grad()
            loss = loss_fn(net(inputs), labels)
            loss.backward()
            sgd.step()

        # Reports the loss of the final batch processed in this epoch.
        print(f"epoch: {epoch}, loss: {loss.item()}")
72+
73+
74+
# `use_gpu` is forwarded to ScalingConfig; keep it True for GPU training.
use_gpu = True

trainer = TorchTrainer(
    train_loop_per_worker=train_func_distributed,
    # num_workers = number of worker nodes with the ray head node included
    scaling_config=ScalingConfig(num_workers=3, use_gpu=use_gpu),
)

# Launch the distributed run and keep the object returned by fit().
results = trainer.fit()

demo-notebooks/guided-demos/notebook-ex-outputs/1_cluster_job_client.ipynb

+3-2
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@
5050
" max_cpus=1,\n",
5151
" min_memory=4,\n",
5252
" max_memory=4,\n",
53-
" num_gpus=0,\n",
53+
" num_gpus=1,\n",
54+
" head_gpus=1,\n",
5455
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
5556
" write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
5657
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
@@ -114,7 +115,7 @@
114115
"source": [
115116
"# Submit an example mnist job using the Job Submission Client\n",
116117
"submission_id = client.submit_job(\n",
117-
" entrypoint=\"python mnist.py\",\n",
118+
" entrypoint=\"python mnist_fashion.py\",\n",
118119
" runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
119120
")\n",
120121
"print(submission_id)"

demo-notebooks/guided-demos/preview_nbs/1_cluster_job_client.ipynb

+3-2
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@
5050
" max_cpus=1,\n",
5151
" min_memory=4,\n",
5252
" max_memory=4,\n",
53-
" num_gpus=0,\n",
53+
" num_gpus=1,\n",
54+
" head_gpus=1,\n",
5455
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
5556
" write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n",
5657
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
@@ -114,7 +115,7 @@
114115
"source": [
115116
"# Submit an example mnist job using the Job Submission Client\n",
116117
"submission_id = client.submit_job(\n",
117-
" entrypoint=\"python mnist.py\",\n",
118+
" entrypoint=\"python mnist_fashion.py\",\n",
118119
" runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements.txt\"},\n",
119120
")\n",
120121
"print(submission_id)"

0 commit comments

Comments
 (0)