@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory
 
 // Scala
 import scala.util.{Failure, Random, Success, Try}
+import scala.collection.JavaConverters._
 
 // Amazon
 import com.amazonaws.services.kinesis.model._
@@ -43,6 +44,7 @@ import com.snowplowanalytics.stream.loader.Config.Sink.BadSink.{Kinesis => Kines
  * @param conf Config for Kinesis sink
  */
 class KinesisSink(conf: KinesisSinkConfig) extends ISink {
+  import KinesisSink._
 
   private lazy val log = LoggerFactory.getLogger(getClass)
 
@@ -78,40 +80,44 @@ class KinesisSink(conf: KinesisSinkConfig) extends ISink {
       case rnfe: ResourceNotFoundException => false
     }
 
-  private def put(name: String, data: ByteBuffer, key: String): PutRecordResult = {
-    val putRecordRequest = {
-      val p = new PutRecordRequest()
-      p.setStreamName(name)
-      p.setData(data)
-      p.setPartitionKey(key)
-      p
+  private def put(name: String, keyedData: List[KeyedData]): PutRecordsResult = {
+    val prres = keyedData.map { case (key, data) =>
+      new PutRecordsRequestEntry()
+        .withPartitionKey(key)
+        .withData(ByteBuffer.wrap(data))
     }
-    client.putRecord(putRecordRequest)
+    val putRecordsRequest =
+      new PutRecordsRequest()
+        .withStreamName(name)
+        .withRecords(prres.asJava)
+    client.putRecords(putRecordsRequest)
   }
 
   /**
-   * Write a record to the Kinesis stream
+   * Write records to the Kinesis stream
    *
-   * @param output The string record to write
-   * @param key A hash of the key determines to which shard the
-   *            record is assigned. Defaults to a random string.
+   * @param outputs The string records to write
    * @param good Unused parameter which exists to extend ISink
    */
-  def store(output: String, key: Option[String], good: Boolean): Unit =
-    Try {
-      put(
-        conf.streamName,
-        ByteBuffer.wrap(output.getBytes(UTF_8)),
-        key.getOrElse(Random.nextInt.toString)
-      )
-    } match {
-      case Success(result) =>
-        log.info("Writing successful")
-        log.info(s"  + ShardId: ${result.getShardId}")
-        log.info(s"  + SequenceNumber: ${result.getSequenceNumber}")
-      case Failure(f) =>
-        log.error("Writing failed")
-        log.error("  + " + f.getMessage)
+  def store(outputs: List[String], good: Boolean): Unit =
+    groupOutputs(conf.recordLimit, conf.byteLimit) {
+      outputs.map(s => Random.nextInt.toString -> s.getBytes(UTF_8))
+    }.foreach { keyedData =>
+      Try {
+        put(
+          conf.streamName,
+          keyedData
+        )
+      } match {
+        case Success(result) =>
+          log.info("Writing successful")
+          result.getRecords.asScala.foreach { record =>
+            log.debug(s"  + ShardId: ${record.getShardId}")
+            log.debug(s"  + SequenceNumber: ${record.getSequenceNumber}")
+          }
+        case Failure(f) =>
+          log.error("Writing to Kinesis failed", f)
+      }
     }
 
   implicit class AwsKinesisClientBuilderExtensions(builder: AmazonKinesisClientBuilder) {
@@ -127,3 +133,38 @@ class KinesisSink(conf: KinesisSinkConfig) extends ISink {
       if (cond) f(builder) else builder
   }
 }
+
+object KinesisSink {
+
+  // Represents a partition key and the serialized record content
+  private type KeyedData = (String, Array[Byte])
+
+  /**
+   * Takes a list of records and splits it into several lists, where each list is as big as
+   * possible while respecting the record limit and the size limit.
+   */
+  def groupOutputs(recordLimit: Int, byteLimit: Int)(
+    keyedData: List[KeyedData]
+  ): List[List[KeyedData]] = {
+    case class Batch(size: Int, count: Int, keyedData: List[KeyedData])
+
+    keyedData
+      .foldLeft(List.empty[Batch]) { case (acc, (key, data)) =>
+        val recordSize = data.length + key.getBytes(UTF_8).length
+        acc match {
+          case head :: tail =>
+            if (head.count + 1 > recordLimit || head.size + recordSize > byteLimit)
+              List(Batch(recordSize, 1, List(key -> data))) ++ List(head) ++ tail
+            else
+              List(
+                Batch(head.size + recordSize, head.count + 1, (key -> data) :: head.keyedData)
+              ) ++ tail
+          case Nil =>
+            List(Batch(recordSize, 1, List(key -> data)))
+        }
+      }
+      .map(_.keyedData.reverse)
+      .reverse
+  }
+
+}
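
For context, a minimal usage sketch of the new batching helper (not part of this commit), assuming the caller can see the `KinesisSink` companion object, e.g. from a test in the same project; the limit values below are illustrative only:

```scala
// Hedged sketch: exercises KinesisSink.groupOutputs with toy data.
// The visibility/package of KinesisSink and the concrete limit values are assumptions.
import java.nio.charset.StandardCharsets.UTF_8

object GroupOutputsExample extends App {
  // Three keyed records: (partition key, serialized payload)
  val keyed: List[(String, Array[Byte])] = List(
    "k1" -> "a".getBytes(UTF_8),
    "k2" -> "bb".getBytes(UTF_8),
    "k3" -> "ccc".getBytes(UTF_8)
  )

  // A record limit of 2 splits the three records into [[k1, k2], [k3]], preserving
  // input order. The byte limit mirrors the 5 MB per-request cap of Kinesis PutRecords,
  // but any value larger than the payloads (plus key bytes) behaves the same here.
  val batches = KinesisSink.groupOutputs(recordLimit = 2, byteLimit = 5 * 1024 * 1024)(keyed)

  batches.foreach { batch =>
    println(batch.map { case (key, data) => s"$key:${data.length}B" }.mkString(", "))
  }
}
```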