Commit 604e31f (parent: ff1d4bb)

Use PutRecords to batch up failed inserts sent to kinesis (close #239)

11 files changed: +244 -58 lines

config/config.kinesis.reference.hocon (+9)

@@ -139,6 +139,15 @@
     # Optional endpoint url configuration to override aws kinesis endpoints,
     # this can be used to specify local endpoints when using localstack
     "customEndpoint": "127.0.0.1:7846"
+
+    # Optional. Limits the number of events in a single PutRecords request.
+    # Maximum allowed: 500
+    "recordLimit": 500
+
+    # Optional. Limits the number of bytes in a single PutRecords request,
+    # including records and partition keys.
+    # Maximum allowed: 5 MB
+    "byteLimit": 5242880
   }
 }
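For reference, the default byteLimit of 5242880 is 5 MB (5 × 1024 × 1024), matching the PutRecords payload cap, and a record's contribution to that limit is its serialized body plus its partition key. A minimal sketch of that size accounting, mirroring what groupOutputs does later in this commit (the recordSize helper is illustrative, not part of the change):

import java.nio.charset.StandardCharsets.UTF_8

// Illustrative only: how one record counts against byteLimit
def recordSize(partitionKey: String, record: String): Int =
  record.getBytes(UTF_8).length + partitionKey.getBytes(UTF_8).length

assert(5 * 1024 * 1024 == 5242880) // default byteLimit = 5 MB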

core/src/main/resources/application.conf (+5 -1)

@@ -25,10 +25,14 @@
         "signing": false
       }
     }
+    "bad": {
+      "recordLimit": 500
+      "byteLimit": 5242880
+    }
   }
   "monitoring": {
     "metrics": {
       "cloudWatch": true
     }
   }
-}
+}

core/src/main/scala/com.snowplowanalytics.stream/loader/Config.scala (+3 -1)

@@ -147,7 +147,9 @@ object Config {
     final case class Kinesis(
       streamName: String,
       region: Region,
-      customEndpoint: Option[String]
+      customEndpoint: Option[String],
+      recordLimit: Int,
+      byteLimit: Int
     ) extends BadSink
   }
 }
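A minimal sketch of a value of the extended case class, assuming the defaults from application.conf above (the stream name and region here are illustrative):

// Hypothetical value, mirroring what the config decoder produces
// when application.conf supplies the "bad" defaults:
val badSink = Config.Sink.BadSink.Kinesis(
  streamName     = "test-kinesis-bad-stream",
  region         = Region("eu-central-1"),
  customEndpoint = None,
  recordLimit    = 500,    // PutRecords allows at most 500 records per call
  byteLimit      = 5242880 // PutRecords allows at most 5 MB per call
)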

core/src/main/scala/com.snowplowanalytics.stream/loader/Emitter.scala (+6 -8)

@@ -71,10 +71,10 @@ class Emitter(
     // Send all valid records to stdout / Sink and return those rejected by it
     val rejects = goodSink match {
       case Left(s) =>
-        validRecords.foreach {
-          case (_, Validated.Valid(r)) => s.store(r.json.toString, None, true)
-          case _ => ()
+        val validStrings = validRecords.collect { case (_, Validated.Valid(r)) =>
+          r.json.toString
         }
+        s.store(validStrings, true)
         Nil
       case Right(_) if validRecords.isEmpty => Nil
       case Right(sender) => emit(validRecords, sender)
@@ -164,11 +164,9 @@ class Emitter(
    * @param records List of failed records
    */
   override def fail(records: JList[EmitterJsonInput]): Unit = {
-    records.asScala.foreach {
-      case (r, Validated.Invalid(fs)) =>
-        val badRow = createBadRow(r, fs)
-        badSink.store(badRow.compact, None, false)
-      case (_, Validated.Valid(_)) => ()
+    val badRows = records.asScala.toList.collect { case (r, Validated.Invalid(fs)) =>
+      createBadRow(r, fs).compact
     }
+    badSink.store(badRows, false)
   }
 }
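Both hunks apply the same pattern: instead of one store call per record, collect the matching cases into a List and hand the whole batch to the sink in a single call. A standalone sketch of that collect idiom (the values are illustrative):

import cats.data.Validated

// Illustrative stand-in for the (original, validated) pairs the Emitter sees
val records: List[(String, Validated[String, Int])] = List(
  "a" -> Validated.Valid(1),
  "b" -> Validated.Invalid("bad"),
  "c" -> Validated.Valid(3)
)

// collect keeps only the cases the partial function matches, in one pass
val valids: List[Int]      = records.collect { case (_, Validated.Valid(n)) => n }
val invalids: List[String] = records.collect { case (_, Validated.Invalid(e)) => e }
// valids == List(1, 3); invalids == List("bad")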

core/src/main/scala/com.snowplowanalytics.stream/loader/executors/StdinExecutor.scala (+2 -2)

@@ -48,10 +48,10 @@ class StdinExecutor(
   def run = for (ln <- scala.io.Source.stdin.getLines) {
     val (line, result) = transformer.consumeLine(ln)
     result.bimap(
-      f => badSink.store(createBadRow(line, f).compact, None, false),
+      f => badSink.store(List(createBadRow(line, f).compact), false),
       s =>
         goodSink match {
-          case Left(gs) => gs.store(s.json.toString, None, true)
+          case Left(gs) => gs.store(List(s.json.toString), true)
           case Right(sender) => sender.send(List(ln -> s.valid))
         }
     )

core/src/main/scala/com.snowplowanalytics.stream/loader/sinks/ISink.scala (+2 -2)

@@ -23,12 +23,12 @@ package sinks
  * Shared interface for all sinks
  */
 trait ISink {
-  def store(output: String, key: Option[String], good: Boolean): Unit
+  def store(outputs: List[String], good: Boolean): Unit
 }
 
 /**
  * Sink which ignores all input
  */
 class NullSink extends ISink {
-  def store(output: String, key: Option[String], good: Boolean): Unit = ()
+  def store(outputs: List[String], good: Boolean): Unit = ()
 }
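With the partition-key parameter gone and store taking the whole batch, a custom sink only has to handle lists. A minimal sketch against the new trait (InMemorySink is a hypothetical name, e.g. for tests, not part of the commit):

// Hypothetical sink that accumulates everything it is given
class InMemorySink extends ISink {
  private var buffer: List[String] = Nil

  def store(outputs: List[String], good: Boolean): Unit =
    buffer = buffer ++ outputs // one call now delivers a whole batch

  def contents: List[String] = buffer
}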

core/src/main/scala/com.snowplowanalytics.stream/loader/sinks/KinesisSink.scala (+68 -27)

@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory
 
 // Scala
 import scala.util.{Failure, Random, Success, Try}
+import scala.collection.JavaConverters._
 
 // Amazon
 import com.amazonaws.services.kinesis.model._
@@ -43,6 +44,7 @@ import com.snowplowanalytics.stream.loader.Config.Sink.BadSink.{Kinesis => KinesisSinkConfig}
  * @param conf Config for Kinesis sink
  */
 class KinesisSink(conf: KinesisSinkConfig) extends ISink {
+  import KinesisSink._
 
   private lazy val log = LoggerFactory.getLogger(getClass)
 
@@ -78,40 +80,44 @@ class KinesisSink(conf: KinesisSinkConfig) extends ISink {
     case rnfe: ResourceNotFoundException => false
   }
 
-  private def put(name: String, data: ByteBuffer, key: String): PutRecordResult = {
-    val putRecordRequest = {
-      val p = new PutRecordRequest()
-      p.setStreamName(name)
-      p.setData(data)
-      p.setPartitionKey(key)
-      p
+  private def put(name: String, keyedData: List[KeyedData]): PutRecordsResult = {
+    val prres = keyedData.map { case (key, data) =>
+      new PutRecordsRequestEntry()
+        .withPartitionKey(key)
+        .withData(ByteBuffer.wrap(data))
     }
-    client.putRecord(putRecordRequest)
+    val putRecordsRequest =
+      new PutRecordsRequest()
+        .withStreamName(name)
+        .withRecords(prres.asJava)
+    client.putRecords(putRecordsRequest)
   }
 
   /**
-   * Write a record to the Kinesis stream
+   * Write records to the Kinesis stream
    *
-   * @param output The string record to write
-   * @param key A hash of the key determines to which shard the
-   *            record is assigned. Defaults to a random string.
+   * @param outputs The string records to write
    * @param good Unused parameter which exists to extend ISink
    */
-  def store(output: String, key: Option[String], good: Boolean): Unit =
-    Try {
-      put(
-        conf.streamName,
-        ByteBuffer.wrap(output.getBytes(UTF_8)),
-        key.getOrElse(Random.nextInt.toString)
-      )
-    } match {
-      case Success(result) =>
-        log.info("Writing successful")
-        log.info(s" + ShardId: ${result.getShardId}")
-        log.info(s" + SequenceNumber: ${result.getSequenceNumber}")
-      case Failure(f) =>
-        log.error("Writing failed")
-        log.error(" + " + f.getMessage)
+  def store(outputs: List[String], good: Boolean): Unit =
+    groupOutputs(conf.recordLimit, conf.byteLimit) {
+      outputs.map(s => Random.nextInt.toString -> s.getBytes(UTF_8))
+    }.foreach { keyedData =>
+      Try {
+        put(
+          conf.streamName,
+          keyedData
+        )
+      } match {
+        case Success(result) =>
+          log.info("Writing successful")
+          result.getRecords.asScala.foreach { record =>
+            log.debug(s" + ShardId: ${record.getShardId}")
+            log.debug(s" + SequenceNumber: ${record.getSequenceNumber}")
+          }
+        case Failure(f) =>
+          log.error("Writing to Kinesis failed", f)
+      }
     }
 
   implicit class AwsKinesisClientBuilderExtensions(builder: AmazonKinesisClientBuilder) {
@@ -127,3 +133,38 @@ class KinesisSink(conf: KinesisSinkConfig) extends ISink {
       if (cond) f(builder) else builder
   }
 }
+
+object KinesisSink {
+
+  // Represents a partition key and the serialized record content
+  private type KeyedData = (String, Array[Byte])
+
+  /**
+   * Takes a list of records and splits it into several lists, where each list is as big as
+   * possible while respecting the record limit and the size limit.
+   */
+  def groupOutputs(recordLimit: Int, byteLimit: Int)(
+    keyedData: List[KeyedData]
+  ): List[List[KeyedData]] = {
+    case class Batch(size: Int, count: Int, keyedData: List[KeyedData])
+
+    keyedData
+      .foldLeft(List.empty[Batch]) { case (acc, (key, data)) =>
+        val recordSize = data.length + key.getBytes(UTF_8).length
+        acc match {
+          case head :: tail =>
+            if (head.count + 1 > recordLimit || head.size + recordSize > byteLimit)
+              List(Batch(recordSize, 1, List(key -> data))) ++ List(head) ++ tail
+            else
+              List(
+                Batch(head.size + recordSize, head.count + 1, (key -> data) :: head.keyedData)
+              ) ++ tail
+          case Nil =>
+            List(Batch(recordSize, 1, List(key -> data)))
+        }
+      }
+      .map(_.keyedData.reverse)
+      .reverse
+  }
+
+}
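As a sanity check on the batching logic, here is a small illustrative trace of groupOutputs with deliberately tiny limits (the keys and payload sizes are made up; each record costs its payload plus its one-byte key):

// Illustrative invocation of KinesisSink.groupOutputs
val batches = KinesisSink.groupOutputs(recordLimit = 2, byteLimit = 10)(
  List(
    "a" -> Array.fill[Byte](4)(0), // 5 bytes with key: starts the first batch
    "b" -> Array.fill[Byte](4)(0), // fits: batch is now 10 bytes, 2 records
    "c" -> Array.fill[Byte](4)(0)  // record limit reached: starts a new batch
  )
)
// batches.map(_.map(_._1)) == List(List("a", "b"), List("c"))

Note that a new batch is started as soon as either limit would be exceeded, and the final reverses restore the original record order within and across batches.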

core/src/main/scala/com.snowplowanalytics.stream/loader/sinks/NsqSink.scala (+6 -4)

@@ -25,6 +25,9 @@ import java.nio.charset.StandardCharsets.UTF_8
 // NSQ
 import com.snowplowanalytics.client.nsq.NSQProducer
 
+// Scala
+import scala.collection.JavaConverters._
+
 import com.snowplowanalytics.stream.loader.Config.Sink.BadSink.{Nsq => NsqSinkConfig}
 
 /**
@@ -39,10 +42,9 @@ class NsqSink(conf: NsqSinkConfig) extends ISink {
   /**
    * Writes a string to NSQ
    *
-   * @param output The string to write
-   * @param key Unused parameter which exists to implement ISink
+   * @param outputs The strings to write
    * @param good Unused parameter which exists to extend ISink
    */
-  override def store(output: String, key: Option[String], good: Boolean): Unit =
-    producer.produce(conf.streamName, output.getBytes(UTF_8))
+  override def store(outputs: List[String], good: Boolean): Unit =
+    producer.produceMulti(conf.streamName, outputs.map(_.getBytes(UTF_8)).asJava)
 }

core/src/main/scala/com.snowplowanalytics.stream/loader/sinks/StdouterrSink.scala (+6 -8)

@@ -27,14 +27,12 @@ class StdouterrSink extends ISink {
   /**
    * Writes a string to stdout or stderr
    *
-   * @param output The string to write
-   * @param key Unused parameter which exists to implement ISink
+   * @param outputs The strings to write
    * @param good Whether to write to stdout or stderr
    */
-  def store(output: String, key: Option[String], good: Boolean) =
-    if (good) {
-      println(output) // To stdout
-    } else {
-      Console.err.println(output) // To stderr
-    }
+  def store(outputs: List[String], good: Boolean): Unit =
+    if (good)
+      outputs.foreach(println(_)) // To stdout
+    else
+      outputs.foreach(Console.err.println(_)) // To stderr
 }

core/src/test/scala/com.snowplowanalytics.stream.loader/ConfigSpec.scala (+12 -5)

@@ -77,7 +77,13 @@ class ConfigSpec extends Specification {
         Sink.GoodSink.Elasticsearch.ESChunk(999999, 499)
       ),
       Sink.BadSink
-        .Kinesis("test-kinesis-bad-stream", Region("eu-central-1"), "127.0.0.1:7846".some)
+        .Kinesis(
+          "test-kinesis-bad-stream",
+          Region("eu-central-1"),
+          "127.0.0.1:7846".some,
+          500,
+          5242880
+        )
     ),
     Purpose.Enriched,
     Monitoring(
@@ -123,7 +129,7 @@ class ConfigSpec extends Specification {
       Sink.GoodSink.Elasticsearch.ESCluster("good", None),
       Sink.GoodSink.Elasticsearch.ESChunk(1000000, 500)
     ),
-    Sink.BadSink.Kinesis("test-kinesis-bad-stream", DefaultTestRegion, None)
+    Sink.BadSink.Kinesis("test-kinesis-bad-stream", DefaultTestRegion, None, 500, 5242880)
   ),
   Purpose.Enriched,
   Monitoring(
@@ -295,7 +301,8 @@ class ConfigSpec extends Specification {
      Sink.GoodSink.Elasticsearch.ESCluster("testindex", None),
      Sink.GoodSink.Elasticsearch.ESChunk(206, 207)
    ),
-    Sink.BadSink.Kinesis("test-kinesis-bad-stream", Region("ca-central-1"), None)
+    Sink.BadSink
+      .Kinesis("test-kinesis-bad-stream", Region("ca-central-1"), None, 500, 5242880)
   ),
   Purpose.Bad,
   Monitoring(
@@ -341,7 +348,7 @@ class ConfigSpec extends Specification {
      Sink.GoodSink.Elasticsearch.ESCluster("good", None),
      Sink.GoodSink.Elasticsearch.ESChunk(1000000, 500)
    ),
-    Sink.BadSink.Kinesis("test-kinesis-bad-stream", DefaultTestRegion, None)
+    Sink.BadSink.Kinesis("test-kinesis-bad-stream", DefaultTestRegion, None, 500, 5242880)
   ),
   Purpose.Enriched,
   Monitoring(
@@ -387,7 +394,7 @@ class ConfigSpec extends Specification {
      Sink.GoodSink.Elasticsearch.ESCluster("good", None),
      Sink.GoodSink.Elasticsearch.ESChunk(1000000, 500)
    ),
-    Sink.BadSink.Kinesis("test-kinesis-bad-stream", Region("eu-west-2"), None)
+    Sink.BadSink.Kinesis("test-kinesis-bad-stream", Region("eu-west-2"), None, 500, 5242880)
   ),
   Purpose.Enriched,
   Monitoring(
