
Commit 4307667

Author: Alex
Updates function description in DP. Fixed typos in MC. Changed Lambda to Gamma as in the book.
1 parent 60013e5 commit 4307667

12 files changed, +133 −153 lines

DP/Policy Evaluation Solution.ipynb

+12 −6

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 53,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -42,10 +44,10 @@
  " policy: [S, A] shaped matrix representing the policy.\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " Vector of length env.nS representing the value function.\n",
@@ -75,7 +77,9 @@
  {
  "cell_type": "code",
  "execution_count": 56,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "random_policy = np.ones([env.nS, env.nA]) / env.nA\n",
@@ -118,7 +122,9 @@
  {
  "cell_type": "code",
  "execution_count": 51,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "# Test: Make sure the evaluated policy is what we expected\n",

DP/Policy Evaluation.ipynb

+12 −6

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 23,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -27,7 +29,9 @@
  {
  "cell_type": "code",
  "execution_count": 25,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n",
@@ -38,10 +42,10 @@
  " policy: [S, A] shaped matrix representing the policy.\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " Vector of length env.nS representing the value function.\n",
@@ -57,7 +61,9 @@
  {
  "cell_type": "code",
  "execution_count": 26,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "random_policy = np.ones([env.nS, env.nA]) / env.nA\n",

DP/Policy Iteration Solution.ipynb

+9 −5

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 1,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -44,10 +46,10 @@
  " policy: [S, A] shaped matrix representing the policy.\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " Vector of length env.nS representing the value function.\n",
@@ -201,7 +203,9 @@
  {
  "cell_type": "code",
  "execution_count": 59,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "# Test the value function\n",

DP/Policy Iteration.ipynb

+9 −5

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 5,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -44,10 +46,10 @@
  " policy: [S, A] shaped matrix representing the policy.\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " Vector of length env.nS representing the value function.\n",
@@ -77,7 +79,9 @@
  {
  "cell_type": "code",
  "execution_count": 13,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):\n",

DP/Value Iteration Solution.ipynb

+12 −6

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 17,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -17,7 +19,9 @@
  {
  "cell_type": "code",
  "execution_count": 18,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "pp = pprint.PrettyPrinter(indent=2)\n",
@@ -27,7 +31,9 @@
  {
  "cell_type": "code",
  "execution_count": 19,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n",
@@ -37,10 +43,10 @@
  " Args:\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " A tuple (policy, V) of the optimal policy and the optimal value function.\n",

DP/Value Iteration.ipynb

+12 −6

@@ -3,7 +3,9 @@
  {
  "cell_type": "code",
  "execution_count": 3,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",
@@ -17,7 +19,9 @@
  {
  "cell_type": "code",
  "execution_count": 4,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "pp = pprint.PrettyPrinter(indent=2)\n",
@@ -27,7 +31,9 @@
  {
  "cell_type": "code",
  "execution_count": 5,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "def value_iteration(env, theta=0.0001, discount_factor=1.0):\n",
@@ -37,10 +43,10 @@
  " Args:\n",
  " env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
  " env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
- " env.nS is a number of available states. \n",
- " env.nA is a number of available actions.\n",
+ " env.nS is a number of states in the environment. \n",
+ " env.nA is a number of actions in the environment.\n",
  " theta: We stop evaluation once our value function change is less than theta for all states.\n",
- " discount_factor: gamma discount factor.\n",
+ " discount_factor: Gamma discount factor.\n",
  " \n",
  " Returns:\n",
  " A tuple (policy, V) of the optimal policy and the optimal value function. \n",

MC/MC Control with Epsilon-Greedy Policies Solution.ipynb

+12 −18

@@ -27,9 +27,7 @@
  {
  "cell_type": "code",
  "execution_count": 2,
- "metadata": {
- "collapsed": false
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "env = BlackjackEnv()"
@@ -81,14 +79,14 @@
  " \n",
  " Args:\n",
  " env: OpenAI gym environment.\n",
- " num_episodes: Nubmer of episodes to sample.\n",
- " discount_factor: Lambda discount factor.\n",
+ " num_episodes: Number of episodes to sample.\n",
+ " discount_factor: Gamma discount factor.\n",
  " epsilon: Chance the sample a random action. Float betwen 0 and 1.\n",
  " \n",
  " Returns:\n",
  " A tuple (Q, policy).\n",
  " Q is a dictionary mapping state -> action values.\n",
- " policy is a function taht takes an observation as an argument and returns\n",
+ " policy is a function that takes an observation as an argument and returns\n",
  " action probabilities\n",
  " \"\"\"\n",
  " \n",
@@ -147,9 +145,7 @@
  {
  "cell_type": "code",
  "execution_count": 5,
- "metadata": {
- "collapsed": false
- },
+ "metadata": {},
  "outputs": [
  {
  "name": "stdout",
@@ -166,9 +162,7 @@
  {
  "cell_type": "code",
  "execution_count": 6,
- "metadata": {
- "collapsed": false
- },
+ "metadata": {},
  "outputs": [
  {
  "data": {
@@ -213,23 +207,23 @@
  ],
  "metadata": {
  "kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 2",
  "language": "python",
- "name": "python3"
+ "name": "python2"
  },
  "language_info": {
  "codemirror_mode": {
  "name": "ipython",
- "version": 3
+ "version": 2
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
  }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
  }
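
The docstring fix in this file states that the returned policy "is a function that takes an observation as an argument and returns action probabilities". A minimal epsilon-greedy policy factory matching that description (the function name and Q-table layout are assumptions for illustration, not the notebook's code):

import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        # Spread epsilon uniformly over all actions...
        A = np.ones(nA, dtype=float) * epsilon / nA
        # ...then give the remaining (1 - epsilon) probability mass to the greedy action.
        best_action = np.argmax(Q[observation])
        A[best_action] += 1.0 - epsilon
        return A
    return policy_fn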
