// Copyright 2020 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import PythonKit
import TensorFlow

// NumPy handle via PythonKit, used below for state conversion and index construction
// (assumed not to be defined elsewhere in this module).
let np = Python.import("numpy")

/// Agent that uses Proximal Policy Optimization (PPO).
///
/// Proximal Policy Optimization is an algorithm that trains an actor (policy) and a critic (value
/// function) using a clipped objective function. The clipped objective simplifies the update rule
/// of its predecessor, Trust Region Policy Optimization (TRPO). For more information, see
/// "Proximal Policy Optimization Algorithms" (Schulman et al., 2017).
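///
/// The clipped surrogate objective maximized in `update()` is
///
///     L(θ) = E[min(r(θ) · A, clip(r(θ), 1 - ε, 1 + ε) · A)] + c · H[π_θ]
///
/// where r(θ) = π_θ(a | s) / π_θ_old(a | s) is the probability ratio, A is the advantage estimate
/// (here, normalized rewards-to-go minus the critic's state value), ε is `clipEpsilon`, and the
/// entropy bonus H weighted by c = `entropyCoefficient` encourages exploration.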
class PPOAgent {
    /// Cache of trajectory segments used for minibatch updates.
    var memory: PPOMemory
    /// The learning rate for both the actor and the critic.
    let learningRate: Float
    /// The discount factor that measures how much weight to give to future
    /// rewards when calculating the action value.
    let discount: Float
    /// Number of epochs of minibatch updates to run once enough trajectory segments are collected.
    let epochs: Int
    /// Parameter used to clip the probability ratio.
    let clipEpsilon: Float
    /// Coefficient for the entropy bonus added to the objective.
    let entropyCoefficient: Float

    var actorCritic: ActorCritic
    var oldActorCritic: ActorCritic
    var actorOptimizer: Adam<ActorNetwork>
    var criticOptimizer: Adam<CriticNetwork>

    init(
        observationSize: Int,
        hiddenSize: Int,
        actionCount: Int,
        learningRate: Float,
        discount: Float,
        epochs: Int,
        clipEpsilon: Float,
        entropyCoefficient: Float
    ) {
        self.learningRate = learningRate
        self.discount = discount
        self.epochs = epochs
        self.clipEpsilon = clipEpsilon
        self.entropyCoefficient = entropyCoefficient

        self.memory = PPOMemory()

        self.actorCritic = ActorCritic(
            observationSize: observationSize,
            hiddenSize: hiddenSize,
            actionCount: actionCount
        )
        self.oldActorCritic = self.actorCritic
        self.actorOptimizer = Adam(for: actorCritic.actorNetwork, learningRate: learningRate)
        self.criticOptimizer = Adam(for: actorCritic.criticNetwork, learningRate: learningRate)
    }

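    /// Samples an action from the old policy for `state`, applies it to `env`, and records the
    /// resulting transition in `memory`.
    ///
    /// - Returns: The next observation, whether the episode terminated, and the reward received.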
    func step(env: PythonObject, state: PythonObject) -> (PythonObject, Bool, Float) {
        let tfState: Tensor<Float> = Tensor<Float>(numpy: np.array([state], dtype: np.float32))!
        let dist: Categorical<Int32> = oldActorCritic(tfState)
        let action: Int32 = dist.sample().scalarized()
        let (newState, reward, isDone, _) = env.step(action).tuple4

        memory.append(
            state: Array(state)!,
            action: action,
            reward: Float(reward)!,
            logProb: dist.logProbabilities[Int(action)].scalarized(),
            isDone: Bool(isDone)!
        )

        return (newState, Bool(isDone)!, Float(reward)!)
    }

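    /// Runs `epochs` epochs of PPO updates on the trajectory segments collected in `memory`,
    /// then synchronizes the old policy with the updated one and clears the memory.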
    func update() {
        // Compute discounted rewards-to-go for advantage estimation, resetting the running sum
        // at episode boundaries.
        var rewards: [Float] = []
        var discountedReward: Float = 0
        for i in (0..<memory.rewards.count).reversed() {
            if memory.isDones[i] {
                discountedReward = 0
            }
            discountedReward = memory.rewards[i] + (discount * discountedReward)
            rewards.insert(discountedReward, at: 0)
        }
        // Normalize the discounted rewards to stabilize training.
        var tfRewards = Tensor<Float>(rewards)
        tfRewards = (tfRewards - tfRewards.mean()) / (tfRewards.standardDeviation() + 1e-5)

        // Retrieve stored states, actions, and log probabilities.
        let oldStates: Tensor<Float> = Tensor<Float>(numpy: np.array(memory.states, dtype: np.float32))!
        let oldActions: Tensor<Int32> = Tensor<Int32>(numpy: np.array(memory.actions, dtype: np.int32))!
        let oldLogProbs: Tensor<Float> = Tensor<Float>(numpy: np.array(memory.logProbs, dtype: np.float32))!

        // Optimize the actor and critic.
        var actorLosses: [Float] = []
        var criticLosses: [Float] = []
        for _ in 0..<epochs {
            // Optimize the policy network (actor).
            let (actorLoss, actorGradients) = valueWithGradient(
                at: self.actorCritic.actorNetwork
            ) { actorNetwork -> Tensor<Float> in
                // Gather the probability of each taken action from the policy's output.
                let npIndices = np.stack(
                    [np.arange(oldActions.shape[0], dtype: np.int32), oldActions.makeNumpyArray()],
                    axis: 1)
                let tfIndices = Tensor<Int32>(numpy: npIndices)!
                let actionProbs = actorNetwork(oldStates).dimensionGathering(atIndices: tfIndices)

                let dist = Categorical<Int32>(probabilities: actionProbs)
                let stateValues = self.actorCritic.criticNetwork(oldStates).flattened()
                // Probability ratio between the updated policy and the old policy.
                let ratios: Tensor<Float> = exp(dist.logProbabilities - oldLogProbs)

                // Clipped surrogate objective: take the elementwise minimum of the unclipped and
                // clipped terms so the update stays close to the old policy.
                let advantages: Tensor<Float> = tfRewards - stateValues
                let surrogateObjective = Tensor(stacking: [
                    ratios * advantages,
                    ratios.clipped(min: 1 - self.clipEpsilon, max: 1 + self.clipEpsilon) * advantages
                ]).min(alongAxes: 0).flattened()
                let entropyBonus: Tensor<Float> = Tensor<Float>(self.entropyCoefficient * dist.entropy())
                // Negate because the optimizer minimizes the loss while PPO maximizes the objective.
                let loss: Tensor<Float> = -1 * (surrogateObjective + entropyBonus)

                return loss.mean()
            }
            self.actorOptimizer.update(&self.actorCritic.actorNetwork, along: actorGradients)
            actorLosses.append(actorLoss.scalarized())

            // Optimize the value network (critic).
            let (criticLoss, criticGradients) = valueWithGradient(
                at: self.actorCritic.criticNetwork
            ) { criticNetwork -> Tensor<Float> in
                let stateValues = criticNetwork(oldStates).flattened()
                // Half mean squared error between predicted state values and discounted rewards.
                let loss: Tensor<Float> = 0.5 * pow(stateValues - tfRewards, 2)

                return loss.mean()
            }
            self.criticOptimizer.update(&self.actorCritic.criticNetwork, along: criticGradients)
            criticLosses.append(criticLoss.scalarized())
        }
        // Synchronize the old policy with the updated one and clear the trajectory cache.
        self.oldActorCritic = self.actorCritic
        memory.removeAll()
    }
}
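
// A minimal usage sketch (not part of this file's API): the environment, hyperparameters, and
// update cadence below are illustrative assumptions, not values fixed by this agent.
//
//     let gym = Python.import("gym")
//     let env = gym.make("CartPole-v0")
//     let agent = PPOAgent(
//         observationSize: 4, hiddenSize: 128, actionCount: 2,
//         learningRate: 3e-4, discount: 0.99, epochs: 10,
//         clipEpsilon: 0.2, entropyCoefficient: 0.01)
//     var state = env.reset()
//     for step in 0..<10_000 {
//         let (nextState, isDone, _) = agent.step(env: env, state: state)
//         state = isDone ? env.reset() : nextState
//         if (step + 1) % 1_000 == 0 { agent.update() }
//     }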