-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtrain_pod.yml
More file actions
138 lines (126 loc) · 3.92 KB
/
train_pod.yml
File metadata and controls
138 lines (126 loc) · 3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# apiVersion: v1
# kind: Pod
# metadata:
# name: reflexion-train-pod
# spec:
# nodeSelector:
# topology.kubernetes.io/region: us-west
# nautilus.io/linstor: "true"
# containers:
# - name: reflexion-trainer
# image: gitlab-registry.nrp-nautilus.io/prp/jupyter-stack/prp
# env:
# - name: REPO_PATH
# value: /app/pytorch-CycleGAN-and-pix2pix
# command:
# - "bash"
# - "-c"
# args:
# - |
# # Install decompression tools (matching data pod)
# sudo apt-get update && sudo apt-get install -y pigz pv
# # Clone repository
# git clone https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix.git ${REPO_PATH}
# cd ${REPO_PATH}
# # Install requirements
# pip install -r requirements.txt
# # Extract compressed data (matching data pod format)
# LATEST_DATASET=$(ls -t /data/processed_data_*.tar.gz 2>/dev/null | head -1)
# if [ -n "$LATEST_DATASET" ]; then
# echo "Found dataset: $LATEST_DATASET"
# pv "$LATEST_DATASET" | pigz -d | tar -xf - -C ${REPO_PATH}/datasets/
# echo "Dataset extracted to ${REPO_PATH}/datasets/"
# else
# echo "No datasets found - run data_pod.yml first"
# fi
# echo "Training environment ready"
# python train.py --dataroot ${REPO_PATH}/datasets/horse2zebra --name test_run --model cycle_gan --n_epochs 5
# sleep infinity
# volumeMounts:
# - name: git-repo
# mountPath: /app
# - name: data-volume
# mountPath: /data
# - name: dshm
# # Shared memory volume for PyTorch DataLoader multiprocessing
# # PyTorch workers need space to transfer loaded batches to main process
# mountPath: /dev/shm
# resources:
# limits:
# memory: 24Gi
# cpu: "12"
# nvidia.com/gpu: "1"
# requests:
# memory: 20Gi
# cpu: "10"
# nvidia.com/gpu: "1"
# volumes:
# - name: git-repo
# emptyDir: {}
# - name: data-volume
# persistentVolumeClaim:
# claimName: cyclegan-data-pvc
# - name: dshm
# emptyDir:
# medium: Memory
# sizeLimit: 8Gi
# restartPolicy: Never
# Interactive test pod for the reflexion repository.
# On start the container clones the repo into REPO_PATH, checks out the
# retrieval-struc-emphasis branch, installs hotpotqa_runs/requirements.txt,
# and then idles so a user can `kubectl exec` into it.
# RESULTS_PATH points at /results, where the reflexion-data-pvc claim is
# mounted; /app is an emptyDir and is discarded with the pod.
apiVersion: v1
kind: Pod
metadata:
  name: reflexion-test-pod
  namespace: kc-ai-research-lab
spec:
  nodeSelector:
    topology.kubernetes.io/region: us-west
    nautilus.io/linstor: "true"
  containers:
    - name: reflexion-trainer
      # NOTE(review): `latest` is a floating tag, so the image contents can
      # change between pod launches - consider pinning a version or digest.
      image: ghcr.io/amit502/reflexion:latest
      env:
        - name: REPO_PATH
          value: /app/reflexion
        - name: RESULTS_PATH
          value: /results
        # The API key is read from an existing Secret rather than being
        # written into this manifest.
        - name: OPENAI_API_KEY
          valueFrom:
            secretKeyRef:
              name: openai-secret
              key: api-key
      command:
        - "bash"
        - "-c"
      args:
        - |
          # Clone the repository, switch to the experiment branch, and install
          # its Python requirements. The steps are chained with && so that the
          # first failure stops the setup instead of cascading into
          # misleading follow-on errors.
          setup() {
            git clone https://github.com/amit502/reflexion.git "${REPO_PATH}" &&
              cd "${REPO_PATH}" &&
              git fetch origin retrieval-struc-emphasis &&
              git checkout retrieval-struc-emphasis &&
              pip install -r hotpotqa_runs/requirements.txt &&
              cd hotpotqa_runs/notebooks
          }

          # Only announce readiness when every setup step succeeded.
          if setup; then
            echo "Environment ready for testing!"
          else
            echo "Environment setup FAILED - see the errors above." >&2
          fi

          # The pod lives in the kc-ai-research-lab namespace, so the exec
          # hint has to name it explicitly.
          echo "Exec into pod with: kubectl exec -it -n kc-ai-research-lab reflexion-test-pod -- bash"

          # Keep pod alive for interactive testing (also after a failed setup,
          # so the failure can be debugged from inside the container).
          sleep infinity
      volumeMounts:
        - name: git-repo
          mountPath: /app
        - name: results-volume
          mountPath: /results
      resources:
        # Requests equal limits for both memory and CPU.
        limits:
          memory: 2Gi
          cpu: "1"
        requests:
          memory: 2Gi
          cpu: "1"
  volumes:
    # Scratch space for the cloned repository.
    - name: git-repo
      emptyDir: {}
    # Storage for results, backed by an existing PersistentVolumeClaim.
    - name: results-volume
      persistentVolumeClaim:
        claimName: reflexion-data-pvc
  # The container is not restarted if its startup script exits.
  restartPolicy: Never