
Commit 4307667

Author: Alex
Updates function description in DP. Fixed typos in MC. Changed Lambda to Gamma as in the book.
1 parent 60013e5 commit 4307667

12 files changed, +133 −153 lines

DP/Policy Evaluation Solution.ipynb

+12 −6

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 53,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -42,10 +44,10 @@
  " policy: [S, A] shaped matrix representing the policy.\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " Vector of length env.nS representing the value function.\n",
@@ -75,7 +77,9 @@
  {
  "cell_type": "code",
  "execution_count": 56,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "random_policy = np.ones([env.nS, env.nA]) / env.nA\n",
@@ -118,7 +122,9 @@
  {
  "cell_type": "code",
  "execution_count": 51,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "# Test: Make sure the evaluated policy is what we expected\n",

DP/Policy Evaluation.ipynb

+12 −6

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 23,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -27,7 +29,9 @@
  {
  "cell_type": "code",
  "execution_count": 25,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n",
@@ -38,10 +42,10 @@
  " policy: [S, A] shaped matrix representing the policy.\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " Vector of length env.nS representing the value function.\n",
@@ -57,7 +61,9 @@
  {
  "cell_type": "code",
  "execution_count": 26,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "random_policy = np.ones([env.nS, env.nA]) / env.nA\n",

DP/Policy Iteration Solution.ipynb

+9 −5

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 1,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -44,10 +46,10 @@
  " policy: [S, A] shaped matrix representing the policy.\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " Vector of length env.nS representing the value function.\n",
@@ -201,7 +203,9 @@
  {
  "cell_type": "code",
  "execution_count": 59,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "# Test the value function\n",

DP/Policy Iteration.ipynb

+9 −5

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 5,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -44,10 +46,10 @@
  " policy: [S, A] shaped matrix representing the policy.\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " Vector of length env.nS representing the value function.\n",
@@ -77,7 +79,9 @@
  {
  "cell_type": "code",
  "execution_count": 13,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n",

DP/Value Iteration Solution.ipynb

+12 −6

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 17,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -17,7 +19,9 @@
  {
  "cell_type": "code",
  "execution_count": 18,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "pp = pprint.PrettyPrinter(indent=2)\n",
@@ -27,7 +31,9 @@
  {
  "cell_type": "code",
  "execution_count": 19,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n",
@@ -37,10 +43,10 @@
  " Args:\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " A tuple (policy, V) of the optimal policy and the optimal value function.\n",

DP/Value Iteration.ipynb

+12 −6

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 3,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -17,7 +19,9 @@
  {
  "cell_type": "code",
  "execution_count": 4,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "pp = pprint.PrettyPrinter(indent=2)\n",
@@ -27,7 +31,9 @@
  {
  "cell_type": "code",
  "execution_count": 5,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n",
@@ -37,10 +43,10 @@
  " Args:\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " A tuple (policy, V) of the optimal policy and the optimal value function. \n",

MC/MC Control with Epsilon-Greedy Policies Solution.ipynb

+12 −18

@@ -27,9 +27,7 @@
  {
  "cell_type": "code",
  "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "env = BlackjackEnv()"
@@ -81,14 +79,14 @@
  " \n",
  " Args:\n",
  " env: OpenAI gym environment.\n",
- " num_episodes: Nubmer of episodes to sample.\n",
- " discount_factor: Lambda discount factor.\n",
+ " num_episodes: Number of episodes to sample.\n",
+ " discount_factor: Gamma discount factor.\n",
  " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n",
  " \n",
  " Returns:\n",
  " A tuple (Q, policy).\n",
  " Q is a dictionary mapping state -> action values.\n",
- " policy is a function taht takes an observation as an argument and returns\n",
+ " policy is a function that takes an observation as an argument and returns\n",
  " action probabilities\n",
  " \"\"\"\n",
  " \n",
@@ -147,9 +145,7 @@
  {
  "cell_type": "code",
  "execution_count": 5,
- "metadata": {
- "collapsed": false
- },
+ "metadata": {},
  "outputs": [
  {
  "name": "stdout",
@@ -166,9 +162,7 @@
  {
  "cell_type": "code",
  "execution_count": 6,
- "metadata": {
- "collapsed": false
- },
+ "metadata": {},
  "outputs": [
  {
  "data": {
@@ -213,23 +207,23 @@
  ],
  "metadata": {
  "kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 2",
  "language": "python",
- "name": "python3"
+ "name": "python2"
  },
  "language_info": {
  "codemirror_mode": {
  "name": "ipython",
- "version": 3
+ "version": 2
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
  }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
  }
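
The docstring fix in this file states that the returned policy "is a function that takes an observation as an argument and returns action probabilities". A minimal epsilon-greedy policy factory matching that description (the function name and Q-table layout are assumptions for illustration, not the notebook's code):

import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        # Spread epsilon uniformly over all actions...
        A = np.ones(nA, dtype=float) * epsilon / nA
        # ...then give the remaining (1 - epsilon) probability mass to the greedy action.
        best_action = np.argmax(Q[observation])
        A[best_action] += 1.0 - epsilon
        return A
    return policy_fn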
