Merge pull request dpkp#111 from rdiomar/multitopic_producers

dpkp · dpkp · commit 4abf7ee1fbbd · 2014-01-30T17:00:55.000-08:00
Make producers take a topic argument at send rather than init time -- fixes Issue dpkp#110, but breaks backwards compatibility with previous Producer interface.
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ Copyright 2013, David Arthur under Apache License, v2.0. See `LICENSE`
 
 # Status
 
-The current version of this package is **0.9.0** and is compatible with 
+The current version of this package is **0.9.0** and is compatible with
 Kafka brokers running version **0.8.1**.
 
 # Usage
@@ -32,24 +32,24 @@ from kafka.producer import SimpleProducer, KeyedProducer
 kafka = KafkaClient("localhost", 9092)
 
 # To send messages synchronously
-producer = SimpleProducer(kafka, "my-topic")
-producer.send_messages("some message")
-producer.send_messages("this method", "is variadic")
+producer = SimpleProducer(kafka)
+producer.send_messages("my-topic", "some message")
+producer.send_messages("my-topic", "this method", "is variadic")
 
 # To send messages asynchronously
-producer = SimpleProducer(kafka, "my-topic", async=True)
-producer.send_messages("async message")
+producer = SimpleProducer(kafka, async=True)
+producer.send_messages("my-topic", "async message")
 
 # To wait for acknowledgements
 # ACK_AFTER_LOCAL_WRITE : server will wait till the data is written to
 #                         a local log before sending response
 # ACK_AFTER_CLUSTER_COMMIT : server will block until the message is committed
 #                            by all in sync replicas before sending a response
-producer = SimpleProducer(kafka, "my-topic", async=False,
+producer = SimpleProducer(kafka, async=False,
                           req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
                           ack_timeout=2000)
 
-response = producer.send_messages("async message")
+response = producer.send_messages("my-topic", "async message")
 
 if response:
     print(response[0].error)
@@ -62,7 +62,7 @@ if response:
 # Notes:
 # * If the producer dies before the messages are sent, there will be losses
 # * Call producer.stop() to send the messages and cleanup
-producer = SimpleProducer(kafka, "my-topic", batch_send=True,
+producer = SimpleProducer(kafka, batch_send=True,
                           batch_send_every_n=20,
                           batch_send_every_t=60)
 
@@ -83,11 +83,11 @@ from kafka.partitioner import HashedPartitioner, RoundRobinPartitioner
 kafka = KafkaClient("localhost", 9092)
 
 # HashedPartitioner is default
-producer = KeyedProducer(kafka, "my-topic")
-producer.send("key1", "some message")
-producer.send("key2", "this methode")
+producer = KeyedProducer(kafka)
+producer.send("my-topic", "key1", "some message")
+producer.send("my-topic", "key2", "this methode")
 
-producer = KeyedProducer(kafka, "my-topic", partitioner=RoundRobinPartitioner)
+producer = KeyedProducer(kafka, partitioner=RoundRobinPartitioner)
 ```
 
 ## Multiprocess consumer
diff --git a/kafka/producer.py b/kafka/producer.py
@@ -8,7 +8,7 @@
 from itertools import cycle
 from multiprocessing import Queue, Process
 
-from kafka.common import ProduceRequest
+from kafka.common import ProduceRequest, TopicAndPartition
 from kafka.partitioner import HashedPartitioner
 from kafka.protocol import create_message
 
@@ -20,7 +20,7 @@
 STOP_ASYNC_PRODUCER = -1
 
 
-def _send_upstream(topic, queue, client, batch_time, batch_size,
+def _send_upstream(queue, client, batch_time, batch_size,
                    req_acks, ack_timeout):
     """
     Listen on the queue for a specified number of messages or till
@@ -44,24 +44,27 @@ def _send_upstream(topic, queue, client, batch_time, batch_size,
         # timeout is reached
         while count > 0 and timeout >= 0:
             try:
-                partition, msg = queue.get(timeout=timeout)
+                topic_partition, msg = queue.get(timeout=timeout)
+
             except Empty:
                 break
 
             # Check if the controller has requested us to stop
-            if partition == STOP_ASYNC_PRODUCER:
+            if topic_partition == STOP_ASYNC_PRODUCER:
                 stop = True
                 break
 
             # Adjust the timeout to match the remaining period
             count -= 1
             timeout = send_at - time.time()
-            msgset[partition].append(msg)
+            msgset[topic_partition].append(msg)
 
         # Send collected requests upstream
         reqs = []
-        for partition, messages in msgset.items():
-            req = ProduceRequest(topic, partition, messages)
+        for topic_partition, messages in msgset.items():
+            req = ProduceRequest(topic_partition.topic,
+                                 topic_partition.partition,
+                                 messages)
             reqs.append(req)
 
         try:
@@ -78,7 +81,6 @@ class Producer(object):
 
     Params:
     client - The Kafka client instance to use
-    topic - The topic for sending messages to
     async - If set to true, the messages are sent asynchronously via another
             thread (process). We will not wait for a response to these
     req_acks - A value indicating the acknowledgements that the server must
@@ -119,8 +121,7 @@ def __init__(self, client, async=False,
         if self.async:
             self.queue = Queue()  # Messages are sent through this queue
             self.proc = Process(target=_send_upstream,
-                                args=(self.topic,
-                                      self.queue,
+                                args=(self.queue,
                                       self.client.copy(),
                                       batch_send_every_t,
                                       batch_send_every_n,
@@ -131,17 +132,18 @@ def __init__(self, client, async=False,
             self.proc.daemon = True
             self.proc.start()
 
-    def send_messages(self, partition, *msg):
+    def send_messages(self, topic, partition, *msg):
         """
         Helper method to send produce requests
         """
         if self.async:
             for m in msg:
-                self.queue.put((partition, create_message(m)))
+                self.queue.put((TopicAndPartition(topic, partition),
+                                create_message(m)))
             resp = []
         else:
             messages = [create_message(m) for m in msg]
-            req = ProduceRequest(self.topic, partition, messages)
+            req = ProduceRequest(topic, partition, messages)
             try:
                 resp = self.client.send_produce_request([req], acks=self.req_acks,
                                                         timeout=self.ack_timeout)
@@ -169,7 +171,6 @@ class SimpleProducer(Producer):
 
     Params:
     client - The Kafka client instance to use
-    topic - The topic for sending messages to
     async - If True, the messages are sent asynchronously via another
             thread (process). We will not wait for a response to these
     req_acks - A value indicating the acknowledgements that the server must
@@ -180,27 +181,31 @@ class SimpleProducer(Producer):
     batch_send_every_n - If set, messages are send in batches of this size
     batch_send_every_t - If set, messages are send after this timeout
     """
-    def __init__(self, client, topic, async=False,
+    def __init__(self, client, async=False,
                  req_acks=Producer.ACK_AFTER_LOCAL_WRITE,
                  ack_timeout=Producer.DEFAULT_ACK_TIMEOUT,
                  batch_send=False,
                  batch_send_every_n=BATCH_SEND_MSG_COUNT,
                  batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL):
-        self.topic = topic
-        client.load_metadata_for_topics(topic)
-        self.next_partition = cycle(client.topic_partitions[topic])
-
+        self.partition_cycles = {}
         super(SimpleProducer, self).__init__(client, async, req_acks,
                                              ack_timeout, batch_send,
                                              batch_send_every_n,
                                              batch_send_every_t)
 
-    def send_messages(self, *msg):
-        partition = self.next_partition.next()
-        return super(SimpleProducer, self).send_messages(partition, *msg)
+    def _next_partition(self, topic):
+        if topic not in self.partition_cycles:
+            if topic not in self.client.topic_partitions:
+                self.client.load_metadata_for_topics(topic)
+            self.partition_cycles[topic] = cycle(self.client.topic_partitions[topic])
+        return self.partition_cycles[topic].next()
+
+    def send_messages(self, topic, *msg):
+        partition = self._next_partition(topic)
+        return super(SimpleProducer, self).send_messages(topic, partition, *msg)
 
     def __repr__(self):
-        return '<SimpleProducer topic=%s, batch=%s>' % (self.topic, self.async)
+        return '<SimpleProducer batch=%s>' % self.async
 
 
 class KeyedProducer(Producer):
@@ -209,7 +214,6 @@ class KeyedProducer(Producer):
 
     Args:
     client - The kafka client instance
-    topic - The kafka topic to send messages to
     partitioner - A partitioner class that will be used to get the partition
         to send the message to. Must be derived from Partitioner
     async - If True, the messages are sent asynchronously via another
@@ -220,29 +224,34 @@ class KeyedProducer(Producer):
     batch_send_every_n - If set, messages are send in batches of this size
     batch_send_every_t - If set, messages are send after this timeout
     """
-    def __init__(self, client, topic, partitioner=None, async=False,
+    def __init__(self, client, partitioner=None, async=False,
                  req_acks=Producer.ACK_AFTER_LOCAL_WRITE,
                  ack_timeout=Producer.DEFAULT_ACK_TIMEOUT,
                  batch_send=False,
                  batch_send_every_n=BATCH_SEND_MSG_COUNT,
                  batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL):
-        self.topic = topic
-        client.load_metadata_for_topics(topic)
-
         if not partitioner:
             partitioner = HashedPartitioner
-
-        self.partitioner = partitioner(client.topic_partitions[topic])
+        self.partitioner_class = partitioner
+        self.partitioners = {}
 
         super(KeyedProducer, self).__init__(client, async, req_acks,
                                             ack_timeout, batch_send,
                                             batch_send_every_n,
                                             batch_send_every_t)
 
-    def send(self, key, msg):
-        partitions = self.client.topic_partitions[self.topic]
-        partition = self.partitioner.partition(key, partitions)
-        return self.send_messages(partition, msg)
+    def _next_partition(self, topic, key):
+        if topic not in self.partitioners:
+            if topic not in self.client.topic_partitions:
+                self.client.load_metadata_for_topics(topic)
+            self.partitioners[topic] = \
+                self.partitioner_class(self.client.topic_partitions[topic])
+        partitioner = self.partitioners[topic]
+        return partitioner.partition(key, self.client.topic_partitions[topic])
+
+    def send(self, topic, key, msg):
+        partition = self._next_partition(topic, key)
+        return self.send_messages(topic, partition, msg)
 
     def __repr__(self):
-        return '<KeyedProducer topic=%s, batch=%s>' % (self.topic, self.async)
+        return '<KeyedProducer batch=%s>' % self.async
diff --git a/test/test_integration.py b/test/test_integration.py