// Copyright 2020 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import PythonKit
import TensorFlow

// NumPy handle via PythonKit, used below for state conversion and index construction
// (assumed not to be defined elsewhere in this module).
let np = Python.import("numpy")

/// Agent that uses Proximal Policy Optimization (PPO).
///
/// Proximal Policy Optimization is an algorithm that trains an actor (policy) and a critic (value
/// function) using a clipped objective function. The clipped objective simplifies the update rule
/// of its predecessor, Trust Region Policy Optimization (TRPO). For more information, see
/// "Proximal Policy Optimization Algorithms" (Schulman et al., 2017).
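///
/// The clipped surrogate objective maximized in `update()` is
///
///     L(θ) = E[min(r(θ) · A, clip(r(θ), 1 - ε, 1 + ε) · A)] + c · H[π_θ]
///
/// where r(θ) = π_θ(a | s) / π_θ_old(a | s) is the probability ratio, A is the advantage estimate
/// (here, normalized rewards-to-go minus the critic's state value), ε is `clipEpsilon`, and the
/// entropy bonus H weighted by c = `entropyCoefficient` encourages exploration.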
class PPOAgent {
    /// Cache of trajectory segments used for minibatch updates.
    var memory: PPOMemory
    /// The learning rate for both the actor and the critic.
    let learningRate: Float
    /// The discount factor that measures how much weight to give to future
    /// rewards when calculating the action value.
    let discount: Float
    /// Number of epochs of minibatch updates to run once enough trajectory segments are collected.
    let epochs: Int
    /// Parameter used to clip the probability ratio.
    let clipEpsilon: Float
    /// Coefficient for the entropy bonus added to the objective.
    let entropyCoefficient: Float

    var actorCritic: ActorCritic
    var oldActorCritic: ActorCritic
    var actorOptimizer: Adam<ActorNetwork>
    var criticOptimizer: Adam<CriticNetwork>

    init(
        observationSize: Int,
        hiddenSize: Int,
        actionCount: Int,
        learningRate: Float,
        discount: Float,
        epochs: Int,
        clipEpsilon: Float,
        entropyCoefficient: Float
    ) {
        self.learningRate = learningRate
        self.discount = discount
        self.epochs = epochs
        self.clipEpsilon = clipEpsilon
        self.entropyCoefficient = entropyCoefficient

        self.memory = PPOMemory()

        self.actorCritic = ActorCritic(
            observationSize: observationSize,
            hiddenSize: hiddenSize,
            actionCount: actionCount
        )
        self.oldActorCritic = self.actorCritic
        self.actorOptimizer = Adam(for: actorCritic.actorNetwork, learningRate: learningRate)
        self.criticOptimizer = Adam(for: actorCritic.criticNetwork, learningRate: learningRate)
    }

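    /// Samples an action from the old policy for `state`, applies it to `env`, and records the
    /// resulting transition in `memory`.
    ///
    /// - Returns: The next observation, whether the episode terminated, and the reward received.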
    func step(env: PythonObject, state: PythonObject) -> (PythonObject, Bool, Float) {
        let tfState: Tensor<Float> = Tensor<Float>(numpy: np.array([state], dtype: np.float32))!
        let dist: Categorical<Int32> = oldActorCritic(tfState)
        let action: Int32 = dist.sample().scalarized()
        let (newState, reward, isDone, _) = env.step(action).tuple4

        memory.append(
            state: Array(state)!,
            action: action,
            reward: Float(reward)!,
            logProb: dist.logProbabilities[Int(action)].scalarized(),
            isDone: Bool(isDone)!
        )

        return (newState, Bool(isDone)!, Float(reward)!)
    }

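    /// Runs `epochs` epochs of PPO updates on the trajectory segments collected in `memory`,
    /// then synchronizes the old policy with the updated one and clears the memory.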
    func update() {
        // Compute discounted rewards-to-go for advantage estimation, resetting the running sum
        // at episode boundaries.
        var rewards: [Float] = []
        var discountedReward: Float = 0
        for i in (0..<memory.rewards.count).reversed() {
            if memory.isDones[i] {
                discountedReward = 0
            }
            discountedReward = memory.rewards[i] + (discount * discountedReward)
            rewards.insert(discountedReward, at: 0)
        }
        // Normalize the discounted rewards to stabilize training.
        var tfRewards = Tensor<Float>(rewards)
        tfRewards = (tfRewards - tfRewards.mean()) / (tfRewards.standardDeviation() + 1e-5)

        // Retrieve stored states, actions, and log probabilities.
        let oldStates: Tensor<Float> = Tensor<Float>(numpy: np.array(memory.states, dtype: np.float32))!
        let oldActions: Tensor<Int32> = Tensor<Int32>(numpy: np.array(memory.actions, dtype: np.int32))!
        let oldLogProbs: Tensor<Float> = Tensor<Float>(numpy: np.array(memory.logProbs, dtype: np.float32))!

        // Optimize the actor and critic.
        var actorLosses: [Float] = []
        var criticLosses: [Float] = []
        for _ in 0..<epochs {
            // Optimize the policy network (actor).
            let (actorLoss, actorGradients) = valueWithGradient(
                at: self.actorCritic.actorNetwork
            ) { actorNetwork -> Tensor<Float> in
                // Gather the probability of each taken action from the policy's output.
                let npIndices = np.stack(
                    [np.arange(oldActions.shape[0], dtype: np.int32), oldActions.makeNumpyArray()],
                    axis: 1)
                let tfIndices = Tensor<Int32>(numpy: npIndices)!
                let actionProbs = actorNetwork(oldStates).dimensionGathering(atIndices: tfIndices)

                let dist = Categorical<Int32>(probabilities: actionProbs)
                let stateValues = self.actorCritic.criticNetwork(oldStates).flattened()
                // Probability ratio between the updated policy and the old policy.
                let ratios: Tensor<Float> = exp(dist.logProbabilities - oldLogProbs)

                // Clipped surrogate objective: take the elementwise minimum of the unclipped and
                // clipped terms so the update stays close to the old policy.
                let advantages: Tensor<Float> = tfRewards - stateValues
                let surrogateObjective = Tensor(stacking: [
                    ratios * advantages,
                    ratios.clipped(min: 1 - self.clipEpsilon, max: 1 + self.clipEpsilon) * advantages
                ]).min(alongAxes: 0).flattened()
                let entropyBonus: Tensor<Float> = Tensor<Float>(self.entropyCoefficient * dist.entropy())
                // Negate because the optimizer minimizes the loss while PPO maximizes the objective.
                let loss: Tensor<Float> = -1 * (surrogateObjective + entropyBonus)

                return loss.mean()
            }
            self.actorOptimizer.update(&self.actorCritic.actorNetwork, along: actorGradients)
            actorLosses.append(actorLoss.scalarized())

            // Optimize the value network (critic).
            let (criticLoss, criticGradients) = valueWithGradient(
                at: self.actorCritic.criticNetwork
            ) { criticNetwork -> Tensor<Float> in
                let stateValues = criticNetwork(oldStates).flattened()
                // Half mean squared error between predicted state values and discounted rewards.
                let loss: Tensor<Float> = 0.5 * pow(stateValues - tfRewards, 2)

                return loss.mean()
            }
            self.criticOptimizer.update(&self.actorCritic.criticNetwork, along: criticGradients)
            criticLosses.append(criticLoss.scalarized())
        }
        // Synchronize the old policy with the updated one and clear the trajectory cache.
        self.oldActorCritic = self.actorCritic
        memory.removeAll()
    }
}
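
// A minimal usage sketch (not part of this file's API): the environment, hyperparameters, and
// update cadence below are illustrative assumptions, not values fixed by this agent.
//
//     let gym = Python.import("gym")
//     let env = gym.make("CartPole-v0")
//     let agent = PPOAgent(
//         observationSize: 4, hiddenSize: 128, actionCount: 2,
//         learningRate: 3e-4, discount: 0.99, epochs: 10,
//         clipEpsilon: 0.2, entropyCoefficient: 0.01)
//     var state = env.reset()
//     for step in 0..<10_000 {
//         let (nextState, isDone, _) = agent.step(env: env, state: state)
//         state = isDone ? env.reset() : nextState
//         if (step + 1) % 1_000 == 0 { agent.update() }
//     }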