Skip to content

Commit 60013e5

Browse files
author
Alex
committed
Sync function descriptions. Lambda -> gamma (discount factor). Added description of env.nS and env.nA
1 parent 094ebf7 commit 60013e5

6 files changed

+90
-126
lines changed

DP/Policy Evaluation Solution.ipynb

+14-20
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
{
44
"cell_type": "code",
55
"execution_count": 53,
6-
"metadata": {
7-
"collapsed": false
8-
},
6+
"metadata": {},
97
"outputs": [],
108
"source": [
119
"import numpy as np\n",
@@ -43,9 +41,11 @@
4341
" Args:\n",
4442
" policy: [S, A] shaped matrix representing the policy.\n",
4543
" env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
46-
" env.P[s][a] is a (prob, next_state, reward, done) tuple.\n",
44+
" env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
45+
"    env.nS is the number of available states.\n",
46+
"    env.nA is the number of available actions.\n",
4747
" theta: We stop evaluation once our value function change is less than theta for all states.\n",
48-
" discount_factor: lambda discount factor.\n",
48+
" discount_factor: gamma discount factor.\n",
4949
" \n",
5050
" Returns:\n",
5151
" Vector of length env.nS representing the value function.\n",
@@ -75,9 +75,7 @@
7575
{
7676
"cell_type": "code",
7777
"execution_count": 56,
78-
"metadata": {
79-
"collapsed": false
80-
},
78+
"metadata": {},
8179
"outputs": [],
8280
"source": [
8381
"random_policy = np.ones([env.nS, env.nA]) / env.nA\n",
@@ -87,9 +85,7 @@
8785
{
8886
"cell_type": "code",
8987
"execution_count": 57,
90-
"metadata": {
91-
"collapsed": false
92-
},
88+
"metadata": {},
9389
"outputs": [
9490
{
9591
"name": "stdout",
@@ -122,9 +118,7 @@
122118
{
123119
"cell_type": "code",
124120
"execution_count": 51,
125-
"metadata": {
126-
"collapsed": false
127-
},
121+
"metadata": {},
128122
"outputs": [],
129123
"source": [
130124
"# Test: Make sure the evaluated policy is what we expected\n",
@@ -144,23 +138,23 @@
144138
],
145139
"metadata": {
146140
"kernelspec": {
147-
"display_name": "Python 3",
141+
"display_name": "Python 2",
148142
"language": "python",
149-
"name": "python3"
143+
"name": "python2"
150144
},
151145
"language_info": {
152146
"codemirror_mode": {
153147
"name": "ipython",
154-
"version": 3
148+
"version": 2
155149
},
156150
"file_extension": ".py",
157151
"mimetype": "text/x-python",
158152
"name": "python",
159153
"nbconvert_exporter": "python",
160-
"pygments_lexer": "ipython3",
161-
"version": "3.5.1"
154+
"pygments_lexer": "ipython2",
155+
"version": "2.7.12"
162156
}
163157
},
164158
"nbformat": 4,
165-
"nbformat_minor": 0
159+
"nbformat_minor": 1
166160
}

DP/Policy Evaluation.ipynb

+12-18
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
{
44
"cell_type": "code",
55
"execution_count": 23,
6-
"metadata": {
7-
"collapsed": false
8-
},
6+
"metadata": {},
97
"outputs": [],
108
"source": [
119
"import numpy as np\n",
@@ -29,9 +27,7 @@
2927
{
3028
"cell_type": "code",
3129
"execution_count": 25,
32-
"metadata": {
33-
"collapsed": false
34-
},
30+
"metadata": {},
3531
"outputs": [],
3632
"source": [
3733
"def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n",
@@ -42,6 +38,8 @@
4238
" policy: [S, A] shaped matrix representing the policy.\n",
4339
" env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
4440
" env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
41+
"    env.nS is the number of available states.\n",
42+
"    env.nA is the number of available actions.\n",
4543
" theta: We stop evaluation once our value function change is less than theta for all states.\n",
4644
" discount_factor: gamma discount factor.\n",
4745
" \n",
@@ -59,9 +57,7 @@
5957
{
6058
"cell_type": "code",
6159
"execution_count": 26,
62-
"metadata": {
63-
"collapsed": false
64-
},
60+
"metadata": {},
6561
"outputs": [],
6662
"source": [
6763
"random_policy = np.ones([env.nS, env.nA]) / env.nA\n",
@@ -71,9 +67,7 @@
7167
{
7268
"cell_type": "code",
7369
"execution_count": 22,
74-
"metadata": {
75-
"collapsed": false
76-
},
70+
"metadata": {},
7771
"outputs": [
7872
{
7973
"ename": "AssertionError",
@@ -107,23 +101,23 @@
107101
],
108102
"metadata": {
109103
"kernelspec": {
110-
"display_name": "Python 3",
104+
"display_name": "Python 2",
111105
"language": "python",
112-
"name": "python3"
106+
"name": "python2"
113107
},
114108
"language_info": {
115109
"codemirror_mode": {
116110
"name": "ipython",
117-
"version": 3
111+
"version": 2
118112
},
119113
"file_extension": ".py",
120114
"mimetype": "text/x-python",
121115
"name": "python",
122116
"nbconvert_exporter": "python",
123-
"pygments_lexer": "ipython3",
124-
"version": "3.5.1"
117+
"pygments_lexer": "ipython2",
118+
"version": "2.7.12"
125119
}
126120
},
127121
"nbformat": 4,
128-
"nbformat_minor": 0
122+
"nbformat_minor": 1
129123
}

DP/Policy Iteration Solution.ipynb

+15-19
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
{
44
"cell_type": "code",
55
"execution_count": 1,
6-
"metadata": {
7-
"collapsed": false
8-
},
6+
"metadata": {},
97
"outputs": [],
108
"source": [
119
"import numpy as np\n",
@@ -45,9 +43,11 @@
4543
" Args:\n",
4644
" policy: [S, A] shaped matrix representing the policy.\n",
4745
" env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
48-
" env.P[s][a] is a (prob, next_state, reward, done) tuple.\n",
49-
" theta: We stop evaluation one our value function change is less than theta for all states.\n",
50-
" discount_factor: lambda discount factor.\n",
46+
" env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
47+
"    env.nS is the number of available states.\n",
48+
"    env.nA is the number of available actions.\n",
49+
" theta: We stop evaluation once our value function change is less than theta for all states.\n",
50+
" discount_factor: gamma discount factor.\n",
5151
" \n",
5252
" Returns:\n",
5353
" Vector of length env.nS representing the value function.\n",
@@ -91,7 +91,7 @@
9191
"    env: The OpenAI environment.\n",
9292
" policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n",
9393
" policy, env, discount_factor.\n",
94-
" discount_factor: Lambda discount factor.\n",
94+
" discount_factor: gamma discount factor.\n",
9595
" \n",
9696
" Returns:\n",
9797
" A tuple (policy, V). \n",
@@ -136,9 +136,7 @@
136136
{
137137
"cell_type": "code",
138138
"execution_count": 64,
139-
"metadata": {
140-
"collapsed": false
141-
},
139+
"metadata": {},
142140
"outputs": [
143141
{
144142
"name": "stdout",
@@ -203,9 +201,7 @@
203201
{
204202
"cell_type": "code",
205203
"execution_count": 59,
206-
"metadata": {
207-
"collapsed": false
208-
},
204+
"metadata": {},
209205
"outputs": [],
210206
"source": [
211207
"# Test the value function\n",
@@ -225,23 +221,23 @@
225221
],
226222
"metadata": {
227223
"kernelspec": {
228-
"display_name": "Python 3",
224+
"display_name": "Python 2",
229225
"language": "python",
230-
"name": "python3"
226+
"name": "python2"
231227
},
232228
"language_info": {
233229
"codemirror_mode": {
234230
"name": "ipython",
235-
"version": 3
231+
"version": 2
236232
},
237233
"file_extension": ".py",
238234
"mimetype": "text/x-python",
239235
"name": "python",
240236
"nbconvert_exporter": "python",
241-
"pygments_lexer": "ipython3",
242-
"version": "3.5.1"
237+
"pygments_lexer": "ipython2",
238+
"version": "2.7.12"
243239
}
244240
},
245241
"nbformat": 4,
246-
"nbformat_minor": 0
242+
"nbformat_minor": 1
247243
}

DP/Policy Iteration.ipynb

+16-22
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
{
44
"cell_type": "code",
55
"execution_count": 5,
6-
"metadata": {
7-
"collapsed": false
8-
},
6+
"metadata": {},
97
"outputs": [],
108
"source": [
119
"import numpy as np\n",
@@ -45,9 +43,11 @@
4543
" Args:\n",
4644
" policy: [S, A] shaped matrix representing the policy.\n",
4745
" env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
48-
" env.P[s][a] is a (prob, next_state, reward, done) tuple.\n",
49-
" theta: We stop evaluation one our value function change is less than theta for all states.\n",
50-
" discount_factor: lambda discount factor.\n",
46+
" env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
47+
"    env.nS is the number of available states.\n",
48+
"    env.nA is the number of available actions.\n",
49+
" theta: We stop evaluation once our value function change is less than theta for all states.\n",
50+
" discount_factor: gamma discount factor.\n",
5151
" \n",
5252
" Returns:\n",
5353
" Vector of length env.nS representing the value function.\n",
@@ -77,9 +77,7 @@
7777
{
7878
"cell_type": "code",
7979
"execution_count": 13,
80-
"metadata": {
81-
"collapsed": false
82-
},
80+
"metadata": {},
8381
"outputs": [],
8482
"source": [
8583
"def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n",
@@ -91,7 +89,7 @@
9189
"    env: The OpenAI environment.\n",
9290
" policy_eval_fn: Policy Evaluation function that takes 3 arguments:\n",
9391
" policy, env, discount_factor.\n",
94-
" discount_factor: Lambda discount factor.\n",
92+
" discount_factor: gamma discount factor.\n",
9593
" \n",
9694
" Returns:\n",
9795
" A tuple (policy, V). \n",
@@ -113,9 +111,7 @@
113111
{
114112
"cell_type": "code",
115113
"execution_count": 14,
116-
"metadata": {
117-
"collapsed": false
118-
},
114+
"metadata": {},
119115
"outputs": [
120116
{
121117
"name": "stdout",
@@ -180,9 +176,7 @@
180176
{
181177
"cell_type": "code",
182178
"execution_count": 15,
183-
"metadata": {
184-
"collapsed": false
185-
},
179+
"metadata": {},
186180
"outputs": [
187181
{
188182
"ename": "AssertionError",
@@ -216,23 +210,23 @@
216210
],
217211
"metadata": {
218212
"kernelspec": {
219-
"display_name": "Python 3",
213+
"display_name": "Python 2",
220214
"language": "python",
221-
"name": "python3"
215+
"name": "python2"
222216
},
223217
"language_info": {
224218
"codemirror_mode": {
225219
"name": "ipython",
226-
"version": 3
220+
"version": 2
227221
},
228222
"file_extension": ".py",
229223
"mimetype": "text/x-python",
230224
"name": "python",
231225
"nbconvert_exporter": "python",
232-
"pygments_lexer": "ipython3",
233-
"version": "3.5.1"
226+
"pygments_lexer": "ipython2",
227+
"version": "2.7.12"
234228
}
235229
},
236230
"nbformat": 4,
237-
"nbformat_minor": 0
231+
"nbformat_minor": 1
238232
}

0 commit comments

Comments
 (0)