@@ -5,7 +5,6 @@
import time
import sys

-from accum_trainer import AccumTrainer
from game_state import GameState
from game_state import ACTION_SIZE
from game_ac_network import GameACFFNetwork, GameACLSTMNetwork
@@ -39,18 +38,18 @@ def __init__(self,

    self.local_network.prepare_loss(ENTROPY_BETA)

-    # TODO: don't need accum trainer anymore with batch
-    self.trainer = AccumTrainer(device)
-    self.trainer.prepare_minimize( self.local_network.total_loss,
-                                   self.local_network.get_vars() )
-
-    self.accum_gradients = self.trainer.accumulate_gradients()
-    self.reset_gradients = self.trainer.reset_gradients()
-
+    with tf.device(device):
+      var_refs = [v.ref() for v in self.local_network.get_vars()]
+      self.gradients = tf.gradients(
+        self.local_network.total_loss, var_refs,
+        gate_gradients=False,
+        aggregation_method=None,
+        colocate_gradients_with_ops=False)
+
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
-      self.trainer.get_accum_grad_list() )
-
+      self.gradients )
+
    self.sync = self.local_network.sync_from(global_network)

    self.game_state = GameState(113 * thread_index)
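Note on the hunk above: the removed AccumTrainer is replaced by a direct tf.gradients call. The returned list is ordered like local_network.get_vars(), so grad_applier can pair each local gradient with the matching variable of the shared global network. A minimal TF1-style sketch of that local-gradients/global-update pattern (the toy variables and the plain GradientDescentOptimizer are illustrative stand-ins, not this repo's RMSPropApplier):

    import tensorflow as tf

    local_w  = tf.Variable([1.0, 2.0])   # stands in for a local network variable
    global_w = tf.Variable([1.0, 2.0])   # shared variable of the same shape

    loss  = tf.reduce_sum(tf.square(local_w))
    grads = tf.gradients(loss, [local_w])            # one tensor per variable

    lr = tf.placeholder(tf.float32)                  # like learning_rate_input
    train = tf.train.GradientDescentOptimizer(lr).apply_gradients(
        zip(grads, [global_w]))                      # local grads -> global vars

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(train, feed_dict={lr: 0.1})         # updates global_w only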
@@ -71,25 +70,14 @@ def _anneal_learning_rate(self, global_time_step):
    return learning_rate

  def choose_action(self, pi_values):
-    values = []
-    sum = 0.0
-    for rate in pi_values:
-      sum = sum + rate
-      value = sum
-      values.append(value)
-
-    r = random.random() * sum
-    for i in range(len(values)):
-      if values[i] >= r:
-        return i;
-    #fail safe
-    return len(values)-1
+    return np.random.choice(range(len(pi_values)), p=pi_values)

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
+    summary_writer.flush()

  def set_start_time(self, start_time):
    self.start_time = start_time
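Two notes on the hunk above. First, np.random.choice draws index i with probability pi_values[i], replacing the hand-rolled cumulative-sum sampler; unlike the old fail-safe branch, it raises ValueError when the probabilities do not sum to (approximately) 1. Second, summary_writer.flush() forces buffered score summaries out to disk so TensorBoard picks them up promptly. A quick empirical check of the sampler (the draw count is arbitrary):

    import numpy as np

    pi_values = np.array([0.1, 0.2, 0.7])        # e.g. softmax policy output
    draws = [np.random.choice(range(len(pi_values)), p=pi_values)
             for _ in range(10000)]
    print(np.bincount(draws) / 10000.0)          # approaches [0.1, 0.2, 0.7]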
@@ -102,9 +90,6 @@ def process(self, sess, global_t, summary_writer, summary_op, score_input):

    terminal_end = False

-    # reset accumulated gradients
-    sess.run( self.reset_gradients )
-
    # copy weights from shared to local
    sess.run( self.sync )

@@ -182,33 +167,32 @@ def process(self, sess, global_t, summary_writer, summary_op, score_input):
      batch_td.append(td)
      batch_R.append(R)

+    cur_learning_rate = self._anneal_learning_rate(global_t)
+
    if USE_LSTM:
      batch_si.reverse()
      batch_a.reverse()
      batch_td.reverse()
      batch_R.reverse()

-      sess.run( self.accum_gradients,
+      sess.run( self.apply_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.local_network.initial_lstm_state: start_lstm_state,
-                  self.local_network.step_size : [len(batch_a)] } )
+                  self.local_network.step_size : [len(batch_a)],
+                  self.learning_rate_input: cur_learning_rate } )
    else:
-      sess.run( self.accum_gradients,
+      sess.run( self.apply_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
-                  self.local_network.r: batch_R } )
+                  self.local_network.r: batch_R,
+                  self.learning_rate_input: cur_learning_rate } )

-    cur_learning_rate = self._anneal_learning_rate(global_t)
-
-    sess.run( self.apply_gradients,
-              feed_dict = { self.learning_rate_input: cur_learning_rate } )
-
    if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
      self.prev_local_t += PERFORMANCE_LOG_INTERVAL
      elapsed_time = time.time() - self.start_time
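With the hunk above, the old accumulate-then-apply two-step collapses into a single sess.run of apply_gradients: the annealed learning rate is computed up front and fed through the learning_rate_input placeholder in the same feed_dict as the batch, which is also why reset_gradients disappeared from process() earlier. A self-contained sketch of feeding a per-step learning rate in one fetch (toy model; tf.train.RMSPropOptimizer stands in for this repo's RMSPropApplier):

    import tensorflow as tf

    x  = tf.placeholder(tf.float32, [None, 1])
    lr = tf.placeholder(tf.float32)              # like learning_rate_input
    w  = tf.Variable([[0.0]])
    loss  = tf.reduce_mean(tf.square(tf.matmul(x, w) - 1.0))
    train = tf.train.RMSPropOptimizer(lr).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # One fetch runs the forward pass, the gradients and the update,
        # with the annealed learning rate supplied in the same feed_dict.
        sess.run(train, feed_dict={x: [[1.0], [2.0]], lr: 7e-4})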