
Commit ab1888d

Thomas Mulc committed: changed names of scripts. ADAG had a problem with local variables being placed on the ps, so that is now fixed.
1 parent f824dff commit ab1888d
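In short: tf.train.replica_device_setter places every tf.Variable on a ps task, even variables registered only in tf.GraphKeys.LOCAL_VARIABLES, so ADAG's "local" variables were silently living on the parameter server. Below is a minimal sketch of the problem and of the fix this commit applies (TensorFlow 1.x; the single ps task and the task indices are illustrative assumptions, not the repo's cluster layout):

import tensorflow as tf

# Inside replica_device_setter, *all* variables are routed to the ps job,
# regardless of collection -- this is what put ADAG's local variables on the ps.
with tf.device(tf.train.replica_device_setter(ps_tasks=1,
        worker_device="/job:worker/task:0")):
    v = tf.Variable(tf.constant(0., shape=[2]),
                    collections=[tf.GraphKeys.LOCAL_VARIABLES])
print(v.device)  # -> /job:ps/task:0

# The fix in ADAG.py: create local variables under an explicit worker device.
with tf.device("/job:worker/replica:0/task:0"):
    w = tf.Variable(tf.constant(0., shape=[2]),
                    collections=[tf.GraphKeys.LOCAL_VARIABLES])
print(w.device)  # -> /job:worker/replica:0/task:0

Pinning the local part of the graph to an explicit worker device keeps per-worker state off the ps, so only the windowed gradient pushes touch the parameter server.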

File tree

5 files changed
+50 -258 lines changed

ADAG/dist_cpu_sing_mach_sync.py → ADAG/ADAG.py

+44 -17
@@ -34,16 +34,17 @@ def main():
     # Graph
     # We must not use train.replicate_device_setter for normal operations
     # with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss\
-    #     ,worker_device="/job:%s/task:%d/cpu:0" % (FLAGS.job_name,FLAGS.task_index))):
-    with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss\
-            ,worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))):
+    #     ,worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))):
 
+    # Local operations
+    with tf.device("/job:worker/replica:0/task:%d" % FLAGS.task_index):
         a = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                         collections=[tf.GraphKeys.LOCAL_VARIABLES])
         b = tf.Variable(tf.constant(0.,shape=[2]),dtype=tf.float32,
                         collections=[tf.GraphKeys.LOCAL_VARIABLES])
         c = a+b
-
+    with tf.device(tf.train.replica_device_setter(ps_tasks=n_pss\
+            ,worker_device="/job:%s/task:%d" % (FLAGS.job_name,FLAGS.task_index))):
         local_step = tf.Variable(0,dtype=tf.int32,trainable=False,name='local_step',
                                  collections=['local_non_trainable'])
         #global step is tricky
@@ -54,19 +55,11 @@ def main():
         # all workers use the same learning rate and it is decided on by task 0
         # or maybe from the graph of the chief worker
         base_lr = .0001
-        loptimizer = tf.train.GradientDescentOptimizer(base_lr)
+        loptimizer = tf.train.GradientDescentOptimizer(base_lr) #local optimizer
         optimizer = tf.train.GradientDescentOptimizer(base_lr) #the learning rate set here is global
 
         #create global variables and/or references
         local_to_global, global_to_local = create_global_variables()
-
-        #local optimizers and steps
-        # actually only need one optimizer
-        # optimizers=[]
-        # local_steps = []
-        # for w in range(n_workers):
-        #     local_steps.append(tf.Variable(0,dtype=tf.int32,trainable=False,name='local_step_%d'%w))
-        #     optimizers.append(tf.train.GradientDescentOptimizer(base_lr))
 
         # ADAG (simplest case since all batches are the same)
         update_window = 3 # T: update/communication window, a.k.a. number of gradients to use before sending to ps
@@ -86,14 +79,20 @@ def main():
                 zip(grads,[local_to_global[v] for v in varss])
                 ,global_step=global_step) #apply the gradients to variables on ps
 
+        # Push to global server
         with tf.control_dependencies([opt]):
             assign_locals = assign_global_to_local(global_to_local)
 
         # Init ops
         init_local = tf.variables_initializer(tf.local_variables()+tf.get_collection('local_non_trainable')) #for local variables
         init = tf.global_variables_initializer() # for global variables
 
+        # TODO: Grab global state before training so all workers have the same initialization
+        grab_global_init = assign_global_to_local(global_to_local)
+
         # TODO: Add op that assigns local values to global ones for the chief to execute
+        assign_global = assign_local_to_global(local_to_global)
+
 
 
     # Session
@@ -105,10 +104,12 @@ def main():
     sess = tf.train.MonitoredTrainingSession(master = server.target,is_chief=is_chief,config=config,
                                              scaffold=scaff,hooks=hooks,save_checkpoint_secs=1,checkpoint_dir='logdir')
     if is_chief:
+        sess.run(assign_global) #TODO #assigns chief's initial values to ps
         time.sleep(10) #grace period to wait on other workers before starting training
 
     # Train until hook stops session
     print('Starting training on worker %d'%FLAGS.task_index)
+    sess.run(grab_global_init)
     while not sess.should_stop():
         _,_,r,gs,ls = sess.run([opt,assign_locals,c,global_step,local_step])
 
@@ -127,14 +128,40 @@ def main():
     print('Session from worker %d closed cleanly'%FLAGS.task_index)
 
 def assign_global_to_local(global_to_local):
+    """
+    global_to_local : dictionary with the corresponding local variable for each global key
+
+    Assigns global variable values to local variables
+    """
     for v in global_to_local.keys():
         tf.assign(global_to_local[v],v)
     return tf.no_op()
 
+def assign_local_to_global(local_to_global):
+    """
+    local_to_global : dictionary with the corresponding global variable for each local key
+
+    Assigns local variable values to global variables
+    """
+    for v in local_to_global.keys():
+        tf.assign(local_to_global[v],v)
+    return tf.no_op()
+
 def get_global_variable_by_name(name):
+    """
+    name : the name of the global variable
+
+    Returns the global variable with the given name
+    """
     return [v for v in tf.global_variables() if v.name == name][0]
 
 def create_global_variables():
+    """
+    Creates global variables for the local variables on the graph.
+
+    Returns dictionaries for the local-to-global and global-to-local
+    variable mappings.
+    """
     # TODO: swap static string with tf.train.replica_device_setter(ps_tasks=n_pss)
     local_to_global = {}
     global_to_local = {}
@@ -149,10 +176,10 @@ def create_global_variables():
         global_to_local[v_g] = v
     return local_to_global,global_to_local
 
-# TODO: initialize global ps variables
-# according to the chief's initial values
-def assign_global_values():
-    return None
+# # TODO: initialize global ps variables
+# # according to the chief's initial values
+# def assign_global_values():
+#     return None
 
 
 if __name__ == '__main__':
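A caveat about the helpers added above: the tf.assign ops are created inside the loop, but nothing connects them to the tf.no_op() that gets returned, so running the returned op does not by itself run the assignments. A sketch of a variant that makes the dependency explicit with tf.group (an illustration over the same mappings, not the committed code):

import tensorflow as tf

def assign_global_to_local(global_to_local):
    """Copy each global variable's value into its local counterpart."""
    assigns = [tf.assign(local_v, global_v)
               for global_v, local_v in global_to_local.items()]
    return tf.group(*assigns)  # a single op that runs every assignment

def assign_local_to_global(local_to_global):
    """Copy each local variable's value into its global counterpart."""
    assigns = [tf.assign(global_v, local_v)
               for local_v, global_v in local_to_global.items()]
    return tf.group(*assigns)

With a grouped op, sess.run(grab_global_init) would be guaranteed to pull the chief's pushed values into each worker before the training loop starts.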

ADAG/run.sh

+3 -3

@@ -1,4 +1,4 @@
 #!/bin/bash
-python dist_cpu_sing_mach_sync.py --job_name "ps" --task_index 0 &
-python dist_cpu_sing_mach_sync.py --job_name "worker" --task_index 0 &
-python dist_cpu_sing_mach_sync.py --job_name "worker" --task_index 1 &
+python ADAG.py --job_name "ps" --task_index 0 &
+python ADAG.py --job_name "worker" --task_index 0 &
+python ADAG.py --job_name "worker" --task_index 1 &
Hogwild/dist_setup.py → Hogwild/Hogwild.py

File renamed without changes.

Hogwild/run.sh

+3 -3

@@ -1,4 +1,4 @@
 #!/bin/bash
-python dist_setup.py --job_name "ps" --task_index 0 &
-python dist_setup.py --job_name "worker" --task_index 0 &
-python dist_setup.py --job_name "worker" --task_index 1 &
+python Hogwild.py --job_name "ps" --task_index 0 &
+python Hogwild.py --job_name "worker" --task_index 0 &
+python Hogwild.py --job_name "worker" --task_index 1 &
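Both run.sh scripts start one ps process and two worker processes on a single machine, passing each process its role through --job_name and --task_index. Presumably each renamed script turns those flags into a tf.train.ClusterSpec and a tf.train.Server roughly like the sketch below; the localhost ports and flag defaults are assumptions, not taken from the repo:

import tensorflow as tf

# Hypothetical flag parsing matching the run.sh invocations.
tf.app.flags.DEFINE_string('job_name', 'worker', "'ps' or 'worker'")
tf.app.flags.DEFINE_integer('task_index', 0, 'index of the task within its job')
FLAGS = tf.app.flags.FLAGS

# Assumed single-machine layout: one ps task, two worker tasks.
cluster = tf.train.ClusterSpec({
    'ps':     ['localhost:2222'],
    'worker': ['localhost:2223', 'localhost:2224'],
})
server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                         task_index=FLAGS.task_index)

if FLAGS.job_name == 'ps':
    server.join()  # the parameter server blocks and serves variables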

Local and Global Variables.ipynb

-235 lines
This file was deleted.
