import time

import numpy as np
- import tensorflow as tf

import gym
+ import tensorflow as tf
import tensorlayer as tl

- ## enable eager mode
- tf.enable_eager_execution()
-
-
- tf.logging.set_verbosity(tf.logging.DEBUG)  # enable logging
tl.logging.set_verbosity(tl.logging.DEBUG)

# hyper-parameters
render = False  # display the game environment
# resume = True  # load existing policy network
model_file_name = "model_pong"
- np.set_printoptions(threshold=np.nan)
+ np.set_printoptions(threshold=np.inf)


def prepro(I):
@@ -73,35 +68,23 @@ def prepro(I):
episode_number = 0

xs, ys, rs = [], [], []
- # observation for training and inference
- # t_states = tf.placeholder(tf.float32, shape=[None, D])
- # policy network
+
+ # policy network
def get_model(inputs_shape):
    ni = tl.layers.Input(inputs_shape)
    nn = tl.layers.Dense(n_units=H, act=tf.nn.relu, name='hidden')(ni)
    nn = tl.layers.Dense(n_units=3, name='output')(nn)
    M = tl.models.Model(inputs=ni, outputs=nn, name="mlp")
    return M
+
+
model = get_model([None, D])
train_weights = model.trainable_weights
- # probs = model(t_states, is_train=True).outputs
- # sampling_prob = tf.nn.softmax(probs)
-
- # t_actions = tf.placeholder(tf.int32, shape=[None])
- # t_discount_rewards = tf.placeholder(tf.float32, shape=[None])
- # loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
- optimizer = tf.train.RMSPropOptimizer(learning_rate, decay_rate)  # .minimize(loss)
-
- # with tf.Session() as sess:
- # sess.run(tf.global_variables_initializer())
- # if resume: TODO
- # load_params = tl.files.load_npz(name=model_file_name+'.npz')
- # tl.files.assign_params(sess, load_params, network)
- # tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network)
- # network.print_params()
- # network.print_layers()
- model.train()  # set model to train mode (in case you add dropout into the model)
+
+ optimizer = tf.optimizers.RMSprop(lr=learning_rate, decay=decay_rate)
+
+ model.train()  # set model to train mode (in case you add dropout into the model)

start_time = time.time()
game_number = 0
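Note (reviewer sketch, not part of this patch): the deleted TF1 session block also handled resuming from a saved `model_pong.npz`. If that behaviour is wanted back under eager mode, something along these lines should work with TensorLayer 2.x's session-free npz helpers; the keyword names (`name`, `network`) are assumed from the TL 2.x API and worth double-checking against the installed version:

```python
# Illustrative resume/save sketch only; not part of this diff.
# Uses the script's existing names: resume, model_file_name, model, tl.
if resume:
    # load previously saved weights straight into the eager-mode model
    tl.files.load_and_assign_npz(name=model_file_name + '.npz', network=model)

# ...and after (or periodically during) training:
tl.files.save_npz(model.trainable_weights, name=model_file_name + '.npz')
```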
@@ -114,14 +97,12 @@ def get_model(inputs_shape):
    x = x.reshape(1, D)
    prev_x = cur_x

-     # prob = sess.run(sampling_prob, feed_dict={t_states: x})
-     _prob = model(x).outputs
+     _prob = model(x)
    prob = tf.nn.softmax(_prob)

    # action. 1: STOP 2: UP 3: DOWN
-     # action = np.random.choice([1,2,3], p=prob.flatten())
-     # action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])
-     # action = np.random.choice([1,2,3], p=prob.numpy())
+     # action = np.random.choice([1,2,3], p=prob.flatten())
+     # action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])
    action = tl.rein.choice_action_by_probs(prob[0].numpy(), [1, 2, 3])

    observation, reward, done, _ = env.step(action)
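For readers comparing the old and new sampling lines: `tl.rein.choice_action_by_probs(prob[0].numpy(), [1, 2, 3])` draws one of the three Pong actions with the probabilities given by the softmax output, i.e. essentially what the commented-out NumPy version did. An illustrative NumPy-only equivalent:

```python
# Plain-NumPy equivalent of the sampling step (illustrative only).
import numpy as np

p = prob[0].numpy()                        # softmax output, shape (3,), sums to 1
action = np.random.choice([1, 2, 3], p=p)  # 1: STOP, 2: UP, 3: DOWN
```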
@@ -145,12 +126,8 @@ def get_model(inputs_shape):

            xs, ys, rs = [], [], []

-             # sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR})
-             # t_actions = tf.placeholder(tf.int32, shape=[None])
-             # t_discount_rewards = tf.placeholder(tf.float32, shape=[None])
-             # loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
            with tf.GradientTape() as tape:
-                 _prob = model(epx).outputs
+                 _prob = model(epx)
                _loss = tl.rein.cross_entropy_reward_loss(_prob, epy, disR)
            grad = tape.gradient(_loss, train_weights)
            optimizer.apply_gradients(zip(grad, train_weights))
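The `tf.GradientTape` block is the eager-mode replacement for the deleted placeholder/`sess.run` update. As a reference for what the loss computes, here is a rough TF2 sketch of a reward-weighted cross-entropy; the real `tl.rein.cross_entropy_reward_loss` may differ in details such as argument names and reduction:

```python
# Rough sketch of the policy-gradient loss used above (illustrative only).
import tensorflow as tf

def reward_weighted_ce(logits, actions, discounted_rewards):
    # per-step cross entropy between the policy logits and the sampled actions
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits)
    # weight each step by its normalized discounted return and sum over the batch
    return tf.reduce_sum(ce * tf.cast(discounted_rewards, ce.dtype))
```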