
Commit def2eb4

Make WordSeg benchmarks support x10 as an option (#593)
Also convert the WordSeg example to x10.
1 parent 709c89e commit def2eb4
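
This commit threads an explicit `Device` through the WordSeg model, benchmarks, and example. As a rough sketch of what the x10 option amounts to on the caller's side (the `useX10` flag below is purely illustrative and not something this commit adds), picking the backend is a matter of choosing a device and moving the model onto it:

```swift
import TensorFlow

// Illustrative only: choose the eager backend or the X10 (XLA) backend.
let useX10 = true
let device = useX10 ? Device.defaultXLA : Device.default

var model = SNLM(parameters: modelParameters)  // `modelParameters` as defined in Examples/WordSeg/main.swift
model.move(to: device)                         // place the model's parameters on that device
```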

5 files changed, +60 -43 lines changed


Benchmarks/Models/WordSeg.swift

Lines changed: 11 additions & 11 deletions
@@ -78,9 +78,9 @@ let maximumSequenceLength = 18
 struct WordSegBenchmark: Benchmark {
   let batchSize: Int
   let duration: BenchmarkDuration
-  let operation: (SNLM, CharacterSequence) -> ()
+  let operation: (SNLM, CharacterSequence, Device) -> ()
 
-  init(settings: BenchmarkSettings, operation: @escaping (SNLM, CharacterSequence) -> ()) {
+  init(settings: BenchmarkSettings, operation: @escaping (SNLM, CharacterSequence, Device) -> ()) {
     self.duration = settings.duration
     self.batchSize = settings.batchSize
     self.operation = operation
@@ -129,7 +129,7 @@ struct WordSegBenchmark: Benchmark {
     }
 
     for _ in 0..<iterations {
-      operation(model, sentence)
+      operation(model, sentence, device)
       LazyTensorBarrier()
 
       batchTimings.append(durationInMilliseconds(since: beforeBatch))
@@ -152,26 +152,26 @@ extension WordSegBenchmark {
     return try CharacterSequence(alphabet: alphabet, appendingEoSTo: truncatedSentence)
   }
 
-  static func score(model: SNLM, sentence: CharacterSequence) {
-    let lattice = model.buildLattice(sentence, maxLen: maximumSequenceLength)
+  static func score(model: SNLM, sentence: CharacterSequence, device: Device) {
+    let lattice = model.buildLattice(sentence, maxLen: maximumSequenceLength, device: device)
     let score = lattice[sentence.count].semiringScore
     let _ = score.logr + score.logp
   }
 
-  static func scoreAndGradient(model: SNLM, sentence: CharacterSequence) {
+  static func scoreAndGradient(model: SNLM, sentence: CharacterSequence, device: Device) {
     let lambd: Float = 0.00075
 
-    let _ = valueWithGradient(at: model) { model -> Float in
-      let lattice = model.buildLattice(sentence, maxLen: maximumSequenceLength)
+    let _ = valueWithGradient(at: model) { model -> Tensor<Float> in
+      let lattice = model.buildLattice(sentence, maxLen: maximumSequenceLength, device: device)
       let score = lattice[sentence.count].semiringScore
       let expectedLength = exp(score.logr - score.logp)
       let loss = -1 * score.logp + lambd * expectedLength
-      return loss
+      return Tensor(loss, on: device)
     }
   }
 
-  static func viterbi(model: SNLM, sentence: CharacterSequence) {
-    var lattice = model.buildLattice(sentence, maxLen: maximumSequenceLength)
+  static func viterbi(model: SNLM, sentence: CharacterSequence, device: Device) {
+    var lattice = model.buildLattice(sentence, maxLen: maximumSequenceLength, device: device)
     let _ = lattice.viterbi(sentence: sentence)
   }
 }
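
With the added `Device` parameter, the three benchmark operations can be driven directly against either backend. A minimal usage sketch, assuming `model` has already been built and moved to `device` and `sentence` prepared the way the benchmark's setup code does:

```swift
let device = Device.defaultXLA  // or Device.default for the eager backend

WordSegBenchmark.score(model: model, sentence: sentence, device: device)
WordSegBenchmark.scoreAndGradient(model: model, sentence: sentence, device: device)
WordSegBenchmark.viterbi(model: model, sentence: sentence, device: device)
LazyTensorBarrier()  // flush pending X10 work, as the timing loop above does
```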

Examples/WordSeg/main.swift

Lines changed: 11 additions & 7 deletions
@@ -66,9 +66,13 @@ let modelParameters = SNLM.Parameters(
   order: order
 )
 
+let device = Device.defaultXLA
+
 var model = SNLM(parameters: modelParameters)
+model.move(to: device)
 
-let optimizer = Adam(for: model, learningRate: learningRate)
+var optimizer = Adam(for: model, learningRate: learningRate)
+optimizer = Adam(copying: optimizer, to: device)
 
 print("Starting training...")
 
@@ -78,18 +82,18 @@ for epoch in 1...maxEpochs {
   var trainingBatchCount = 0
   for record in dataset.training {
     let sentence = record.numericalizedText
-    let (loss, gradients) = valueWithGradient(at: model) { model -> Float in
-      let lattice = model.buildLattice(sentence, maxLen: maxLength)
+    let (loss, gradients) = valueWithGradient(at: model) { model -> Tensor<Float> in
+      let lattice = model.buildLattice(sentence, maxLen: maxLength, device: device)
       let score = lattice[sentence.count].semiringScore
       let expectedLength = exp(score.logr - score.logp)
       let loss = -1 * score.logp + lambd * expectedLength
-      return loss
+      return Tensor(loss, on: device)
     }
 
-    trainingLossSum += loss
+    trainingLossSum += loss.scalarized()
     trainingBatchCount += 1
     optimizer.update(&model, along: gradients)
-
+    LazyTensorBarrier()
     if hasNaN(gradients) {
       print("Warning: grad has NaN")
     }
@@ -129,7 +133,7 @@ for epoch in 1...maxEpochs {
   var validationPlainText: String = ""
   for record in validationDataset {
     let sentence = record.numericalizedText
-    var lattice = model.buildLattice(sentence, maxLen: maxLength)
+    var lattice = model.buildLattice(sentence, maxLen: maxLength, device: device)
     let score = lattice[sentence.count].semiringScore
 
     validationLossSum -= score.logp
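
Pulled together, the x10 conversion of the example follows a small recipe: put the model and optimizer state on the device, keep the loss on the device as a `Tensor<Float>`, and cut the lazy trace once per step. A condensed sketch of one training step, assuming `sentence`, `maxLength`, `lambd`, and `learningRate` are in scope as in main.swift:

```swift
import TensorFlow

let device = Device.defaultXLA

var model = SNLM(parameters: modelParameters)
model.move(to: device)                              // model parameters live on the X10 device

var optimizer = Adam(for: model, learningRate: learningRate)
optimizer = Adam(copying: optimizer, to: device)    // optimizer state must live there too

let (loss, gradients) = valueWithGradient(at: model) { model -> Tensor<Float> in
  let lattice = model.buildLattice(sentence, maxLen: maxLength, device: device)
  let score = lattice[sentence.count].semiringScore
  let expectedLength = exp(score.logr - score.logp)
  let loss = -1 * score.logp + lambd * expectedLength
  return Tensor(loss, on: device)                   // keep the loss as a device tensor
}
optimizer.update(&model, along: gradients)
LazyTensorBarrier()                                  // materialize the traced step
let hostLoss = loss.scalarized()                     // copy the scalar back to the host for logging
```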

Models/Text/WordSeg/Model.swift

Lines changed: 28 additions & 18 deletions
@@ -114,8 +114,8 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable {
 
   // MARK: - Encode
   /// Returns the hidden states of the encoder LSTM applied to the given sentence.
-  public func encode(_ x: CharacterSequence) -> [Tensor<Float>] {
-    var embedded = encoderEmbedding(x.tensor)
+  public func encode(_ x: CharacterSequence, device: Device) -> [Tensor<Float>] {
+    var embedded = encoderEmbedding(x.tensor(device: device))
     embedded = dropout(embedded)
     let encoderStates = encoderLSTM(embedded.unstacked().differentiableMap { $0.rankLifted() })
     var encoderResult = Tensor(
@@ -126,7 +126,9 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable {
 
   // MARK: - Decode
   /// Returns log probabilities for each of the candidates.
-  public func decode(_ candidates: [CharacterSequence], _ state: Tensor<Float>) -> Tensor<Float> {
+  public func decode(_ candidates: [CharacterSequence], _ state: Tensor<Float>, device: Device)
+    -> Tensor<Float>
+  {
     // TODO(TF-433): Remove closure workaround when autodiff supports non-active rethrowing
     // functions (`Array.map`).
     let maxLen = { candidates.map { $0.count }.max()! + 1 }()
@@ -148,21 +150,25 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable {
 
     // Shapes are [time x batch] so that we can unstack the time dimension into the array that
     // the LSTM wants as input.
-    let x: Tensor<Int32> = Tensor(shape: [candidates.count, maxLen], scalars: xBatch).transposed()
-    let y: Tensor<Int32> = Tensor(shape: [candidates.count, maxLen], scalars: yBatch).transposed()
+    let x: Tensor<Int32> = Tensor(
+      shape: [candidates.count, maxLen], scalars: xBatch, on: device
+    ).transposed()
+    let y: Tensor<Int32> = Tensor(
+      shape: [candidates.count, maxLen], scalars: yBatch, on: device
+    ).transposed()
 
     // [time x batch x ndim]
     var embeddedX = decoderEmbedding(x)
     embeddedX = dropout(embeddedX)
 
     // [batch x ndim]
-    let stateBatch = state.rankLifted().tiled(multiples: Tensor([Int32(candidates.count), 1]))
+    let stateBatch = state.rankLifted().tiled(multiples: [candidates.count, 1])
 
     // [time] array of LSTM states whose `hidden` and `cell` fields have shape [batch x ndim]
     let decoderStates = decoderLSTM(
       embeddedX.unstacked(),
       initialState: LSTMCell.State(
-        cell: Tensor(zeros: stateBatch.shape),
+        cell: Tensor(zeros: stateBatch.shape, on: device),
         hidden: stateBatch))
 
     // [time x batch x ndim]
@@ -183,7 +189,11 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable {
     ).reshaped(to: y.shape)
 
     // [time x batch]
-    let logpExcludingPad = logp * Tensor<Float>(y .!= parameters.chrVocab.pad)
+    let padScalars = [Int32](repeating: parameters.chrVocab.pad, count: candidates.count * maxLen)
+    let noPad = Tensor<Int32>(
+      y .!= Tensor(shape: y.shape, scalars: padScalars, on: device))
+    let noPadFloat = Tensor<Float>(noPad)
+    let logpExcludingPad = logp * noPadFloat
 
     // [batch]
     let candidateLogP = logpExcludingPad.transposed().sum(squeezingAxes: 1)
@@ -200,9 +210,9 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable {
   }
 
   @differentiable
-  public func buildLattice(_ sentence: CharacterSequence, maxLen: Int) -> Lattice {
+  public func buildLattice(_ sentence: CharacterSequence, maxLen: Int, device: Device) -> Lattice {
     var lattice = Lattice(count: sentence.count)
-    let states = encode(sentence)
+    let states = encode(sentence, device: device)
     let logg_batch = mlpInterpolation(Tensor(stacking: states))
     let logp_lex_batch = mlpMemory(Tensor(stacking: states))
     for pos in 0..<sentence.count {
@@ -225,9 +235,10 @@ public struct SNLM: EuclideanDifferentiable, KeyPathIterable {
       }
 
      let current_state = states[pos]
-      let logg = logg_batch[pos].scalarsADHack  // [2]
-      let logp_lex = logp_lex_batch[pos].scalarsADHack  // [strVocab.chr.count]
-      let logp_chr = decode(candidates, current_state).scalarsADHack  // [candidates.count]
+      let logg = logg_batch[pos].scalarsADHack(device: device)  // [2]
+      let logp_lex = logp_lex_batch[pos].scalarsADHack(device: device)  // [strVocab.chr.count]
+      let logp_chr = decode(candidates, current_state, device: device)
+        .scalarsADHack(device: device)  // [candidates.count]
       if pos != 0 {
         // Cleanup: lattice[pos].recomputeSemiringScore()
         var updatedNode = lattice[pos]
@@ -315,23 +326,22 @@ extension Tensor {
   // (`Differentiable.zeroTangentVectorInitializer`) instead of static zeros
   // (`AdditiveArithmetic.zero`).
   @differentiable(where Scalar: TensorFlowFloatingPoint)
-  var scalarsADHack: [Scalar] {
+  func scalarsADHack(device: Device) -> [Scalar] {
     scalars
   }
 
   @derivative(of: scalarsADHack)
-  func vjpScalarsADHack() -> (
+  func vjpScalarsADHack(device: Device) -> (
     value: [Scalar], pullback: (Array<Scalar>.TangentVector) -> Tensor
   ) where Scalar: TensorFlowFloatingPoint {
     // In the pullback: capture only `self.shape`, not all of `self`.
     let shape = self.shape
     func pullback(_ tv: Array<Scalar>.TangentVector) -> Tensor {
       if tv.count == 0 {
-        return Tensor(zeros: shape)
+        return Tensor(zeros: shape, on: device)
       }
-      return Tensor(shape: shape, scalars: tv.base)
+      return Tensor(shape: shape, scalars: tv.base, on: device)
     }
     return (scalars, pullback)
   }
 }
-
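
The recurring theme inside the model is that every tensor an operation creates must name the device, so the X10 trace never silently falls back to the default one. A small illustration of the initializers this diff leans on; the shapes and values below are made up for the sketch:

```swift
import TensorFlow

let device = Device.defaultXLA

// Index tensors built from host scalars go straight onto the device.
let y = Tensor<Int32>(shape: [2, 3], scalars: [3, 1, 2, 0, 4, 0], on: device)

// A padding mask built the way `decode` now does it: compare against a
// same-shaped tensor of pad ids created on the same device, then cast.
let padId: Int32 = 0
let padScalars = [Int32](repeating: padId, count: 2 * 3)
let noPad = Tensor<Float>(Tensor<Int32>(y .!= Tensor(shape: y.shape, scalars: padScalars, on: device)))

// Zero tensors (for example the LSTM's initial cell state) also name the device explicitly.
let zeroState = Tensor<Float>(zeros: [2, 5], on: device)
```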

Support/Text/WordSeg/CharacterSequence.swift

Lines changed: 4 additions & 3 deletions
@@ -55,11 +55,12 @@ public struct CharacterSequence: Hashable {
     return characters[range]
   }
 
+  public func tensor(device: Device) -> Tensor<Int32> {
+    Tensor<Int32>([self.eos] + characters[0..<characters.count - 1], on: device)
+  }
+
   public var count: Int { return characters.count }
   public var last: Int32? { return characters.last }
-  public var tensor: Tensor<Int32> {
-    Tensor<Int32>([self.eos] + characters[0..<characters.count - 1])
-  }
 }
 
 extension CharacterSequence: CustomStringConvertible {
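
Call sites that used the old computed `tensor` property now pass the device explicitly. A small usage sketch, assuming `alphabet` is the corpus alphabet loaded elsewhere and script-level `try` is acceptable:

```swift
let sentence = try CharacterSequence(alphabet: alphabet, appendingEoSTo: "abab")
// Before: sentence.tensor                  (always materialized on the default device)
// After:  sentence.tensor(device:)         (placed wherever the caller asks)
let ids = sentence.tensor(device: Device.defaultXLA)  // Tensor<Int32> on the X10 device
```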

Tests/TextTests/WordSegmentationTests/ProbeLayers.swift

Lines changed: 6 additions & 4 deletions
@@ -155,10 +155,11 @@ class WordSegProbeLayerTests: XCTestCase {
         order: 5))
 
     model.setParameters(Example1.parameters)
+    let device = Device.default
 
     print("Encoding")
     let encoderStates = model.encode(
-      CharacterSequence(alphabet: chrVocab, characters: [0, 1, 0, 1]))  // "abab"
+      CharacterSequence(alphabet: chrVocab, characters: [0, 1, 0, 1]), device: device)  // "abab"
     let encoderStatesTensor = Tensor(stacking: encoderStates)
     print("Expected: \(Example1.expectedEncoding)")
     print("Actual: \(encoderStatesTensor)")
@@ -187,7 +188,8 @@ class WordSegProbeLayerTests: XCTestCase {
       CharacterSequence(alphabet: chrVocab, characters: [0, 0, 0]),  // "aaa"
       CharacterSequence(alphabet: chrVocab, characters: [0, 1]),  // "ab"
       ],
-      encoderStates[0]
+      encoderStates[0],
+      device: device
     )
     print("Expected: \(Example1.expectedDecoded)")
     print("Actual: \(decoded)")
@@ -196,12 +198,12 @@ class WordSegProbeLayerTests: XCTestCase {
 
     print("Build Lattice")
     let abab = CharacterSequence(alphabet: chrVocab, characters: [0, 1, 0, 1])
-    let lattice = model.buildLattice(abab, maxLen: 5)
+    let lattice = model.buildLattice(abab, maxLen: 5, device: device)
     XCTAssert(lattice.isAlmostEqual(to: Example1.lattice, tolerance: 1e-5))
 
     print("Gradient")
     func f(_ x: SNLM) -> Float {
-      x.buildLattice(abab, maxLen: 5)[4].semiringScore.logr
+      x.buildLattice(abab, maxLen: 5, device: device)[4].semiringScore.logr
     }
     let (_, grad) = valueWithGradient(at: model, in: f)
     let expectedGrad = tangentVector(from: Example1.gradWrtLogR, model: model)
