import copy
import itertools
import logging
-import random
import sys
import time
@@ -57,7 +56,6 @@ class Fetcher(six.Iterator):
        'max_partition_fetch_bytes': 1048576,
        'max_poll_records': sys.maxsize,
        'check_crcs': True,
-        'iterator_refetch_records': 1,  # undocumented -- interface may change
        'metric_group_prefix': 'consumer',
        'retry_backoff_ms': 100,
        'enable_incremental_fetch_sessions': True,
@@ -380,10 +378,13 @@ def _append(self, drained, part, max_records, update_offsets):
            # as long as the partition is still assigned
            position = self._subscriptions.assignment[tp].position
            if part.next_fetch_offset == position.offset:
-                part_records = part.take(max_records)
                log.debug("Returning fetched records at offset %d for assigned"
                          " partition %s", position.offset, tp)
-                drained[tp].extend(part_records)
+                part_records = part.take(max_records)
+                # list.extend([]) is a noop, but because drained is a defaultdict
+                # we should avoid initializing the default list unless there are records
+                if part_records:
+                    drained[tp].extend(part_records)
                # We want to increment subscription position if (1) we're using consumer.poll(),
                # or (2) we didn't return any records (consumer iterator will update position
                # when each message is yielded). There may be edge cases where we re-fetch records
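The new `if part_records:` guard in this hunk matters because of how defaultdict handles key lookups; a minimal standalone sketch (plain Python, not part of the diff) of that behavior:

import collections

drained = collections.defaultdict(list)

# extend([]) adds no records, but the [] lookup alone creates an empty
# list under the key, so the partition still appears in the drained dict:
drained[("topic", 0)].extend([])
assert ("topic", 0) in drained

# Guarding the extend keeps partitions without records out of the dict:
guarded = collections.defaultdict(list)
part_records = []
if part_records:
    guarded[("topic", 0)].extend(part_records)
assert ("topic", 0) not in guarded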
@@ -562,13 +563,11 @@ def _handle_list_offsets_response(self, future, response):
    def _fetchable_partitions(self):
        fetchable = self._subscriptions.fetchable_partitions()
        # do not fetch a partition if we have a pending fetch response to process
+        discard = {fetch.topic_partition for fetch in self._completed_fetches}
        current = self._next_partition_records
-        pending = copy.copy(self._completed_fetches)
        if current:
-            fetchable.discard(current.topic_partition)
-        for fetch in pending:
-            fetchable.discard(fetch.topic_partition)
-        return fetchable
+            discard.add(current.topic_partition)
+        return [tp for tp in fetchable if tp not in discard]

    def _create_fetch_requests(self):
        """Create fetch requests for all assigned partitions, grouped by node.
@@ -581,7 +580,7 @@ def _create_fetch_requests(self):
        # create the fetch info as a dict of lists of partition info tuples
        # which can be passed to FetchRequest() via .items()
        version = self._client.api_version(FetchRequest, max_version=10)
-        fetchable = collections.defaultdict(dict)
+        fetchable = collections.defaultdict(collections.OrderedDict)

        for partition in self._fetchable_partitions():
            node_id = self._client.cluster.leader_for_partition(partition)
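Using collections.OrderedDict as the defaultdict factory keeps each node's partition map in insertion order on every supported interpreter (plain dicts only guarantee this from CPython 3.7); a short standalone sketch of the effect:

import collections

fetchable = collections.defaultdict(collections.OrderedDict)

# Illustrative (node_id, partition) pairs; the inner mapping remembers
# the order in which partitions were added for each node:
for node_id, partition in [(1, "a-0"), (1, "a-1"), (2, "b-0")]:
    fetchable[node_id][partition] = None  # placeholder for per-partition fetch info

assert list(fetchable[1]) == ["a-0", "a-1"]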
@@ -695,10 +694,7 @@ def _handle_fetch_response(self, node_id, fetch_offsets, send_time, response):
                          for partition_data in partitions])
        metric_aggregator = FetchResponseMetricAggregator(self._sensors, partitions)

-        # randomized ordering should improve balance for short-lived consumers
-        random.shuffle(response.topics)
        for topic, partitions in response.topics:
-            random.shuffle(partitions)
            for partition_data in partitions:
                tp = TopicPartition(topic, partition_data[0])
                fetch_offset = fetch_offsets[tp]
@@ -733,8 +729,6 @@ def _parse_fetched_data(self, completed_fetch):
                          " since it is no longer fetchable", tp)

            elif error_type is Errors.NoError:
-                self._subscriptions.assignment[tp].highwater = highwater
-
                # we are interested in this fetch only if the beginning
                # offset (of the *request*) matches the current consumed position
                # Note that the *response* may return a messageset that starts
@@ -748,30 +742,35 @@ def _parse_fetched_data(self, completed_fetch):
                    return None

                records = MemoryRecords(completed_fetch.partition_data[-1])
-                if records.has_next():
-                    log.debug("Adding fetched record for partition %s with"
-                              " offset %d to buffered record list", tp,
-                              position.offset)
-                    parsed_records = self.PartitionRecords(fetch_offset, tp, records,
-                                                           self.config['key_deserializer'],
-                                                           self.config['value_deserializer'],
-                                                           self.config['check_crcs'],
-                                                           completed_fetch.metric_aggregator)
-                    return parsed_records
-                elif records.size_in_bytes() > 0:
-                    # we did not read a single message from a non-empty
-                    # buffer because that message's size is larger than
-                    # fetch size, in this case record this exception
-                    record_too_large_partitions = {tp: fetch_offset}
-                    raise RecordTooLargeError(
-                        "There are some messages at [Partition=Offset]: %s "
-                        " whose size is larger than the fetch size %s"
-                        " and hence cannot be ever returned."
-                        " Increase the fetch size, or decrease the maximum message"
-                        " size the broker will allow." % (
-                            record_too_large_partitions,
-                            self.config['max_partition_fetch_bytes']),
-                        record_too_large_partitions)
+                log.debug("Preparing to read %s bytes of data for partition %s with offset %d",
+                          records.size_in_bytes(), tp, fetch_offset)
+                parsed_records = self.PartitionRecords(fetch_offset, tp, records,
+                                                       self.config['key_deserializer'],
+                                                       self.config['value_deserializer'],
+                                                       self.config['check_crcs'],
+                                                       completed_fetch.metric_aggregator,
+                                                       self._on_partition_records_drain)
+                if not records.has_next() and records.size_in_bytes() > 0:
+                    if completed_fetch.response_version < 3:
+                        # Implement the pre KIP-74 behavior of throwing a RecordTooLargeException.
+                        record_too_large_partitions = {tp: fetch_offset}
+                        raise RecordTooLargeError(
+                            "There are some messages at [Partition=Offset]: %s "
+                            " whose size is larger than the fetch size %s"
+                            " and hence cannot be ever returned. Please consider upgrading your broker to 0.10.1.0 or"
+                            " newer to avoid this issue. Alternatively, increase the fetch size on the client (using"
+                            " max_partition_fetch_bytes)" % (
+                                record_too_large_partitions,
+                                self.config['max_partition_fetch_bytes']),
+                            record_too_large_partitions)
+                    else:
+                        # This should not happen with brokers that support FetchRequest/Response V3 or higher (i.e. KIP-74)
+                        raise Errors.KafkaError("Failed to make progress reading messages at %s=%s."
+                                                " Received a non-empty fetch response from the server, but no"
+                                                " complete records were found." % (tp, fetch_offset))
+
+                if highwater >= 0:
+                    self._subscriptions.assignment[tp].highwater = highwater

            elif error_type in (Errors.NotLeaderForPartitionError,
                                Errors.ReplicaNotAvailableError,
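For the pre-KIP-74 branch above, the error message points at the client-side fetch limit; a hedged usage sketch of raising it on a KafkaConsumer (topic and broker address are placeholders):

from kafka import KafkaConsumer

# Raising max_partition_fetch_bytes is the client-side workaround when an
# old (< 0.10.1.0) broker holds a single message larger than the default 1 MiB:
consumer = KafkaConsumer(
    'my-topic',                              # placeholder topic
    bootstrap_servers='localhost:9092',      # placeholder broker
    max_partition_fetch_bytes=10 * 1048576,  # allow up to 10 MiB per partition per fetch
)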
@@ -805,14 +804,25 @@ def _parse_fetched_data(self, completed_fetch):
            if parsed_records is None:
                completed_fetch.metric_aggregator.record(tp, 0, 0)

-        return None
+        if error_type is not Errors.NoError:
+            # we move the partition to the end if there was an error. This way, it's more likely that partitions for
+            # the same topic can remain together (allowing for more efficient serialization).
+            self._subscriptions.move_partition_to_end(tp)
+
+        return parsed_records
+
+    def _on_partition_records_drain(self, partition_records):
+        # we move the partition to the end if we received some bytes. This way, it's more likely that partitions
+        # for the same topic can remain together (allowing for more efficient serialization).
+        if partition_records.bytes_read > 0:
+            self._subscriptions.move_partition_to_end(partition_records.topic_partition)

    def close(self):
        if self._next_partition_records is not None:
            self._next_partition_records.drain()

    class PartitionRecords(object):
-        def __init__(self, fetch_offset, tp, records, key_deserializer, value_deserializer, check_crcs, metric_aggregator):
+        def __init__(self, fetch_offset, tp, records, key_deserializer, value_deserializer, check_crcs, metric_aggregator, on_drain):
            self.fetch_offset = fetch_offset
            self.topic_partition = tp
            self.leader_epoch = -1
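The new on_drain hook is a callback fired once when a PartitionRecords buffer is exhausted; a minimal standalone sketch of the pattern (illustrative names, not the library's API):

class DrainableBuffer(object):
    """Toy stand-in for PartitionRecords, showing the single-shot drain callback."""

    def __init__(self, records, on_drain):
        self._iterator = iter(records)
        self.bytes_read = len(records)
        self.on_drain = on_drain

    def drain(self):
        if self._iterator is not None:
            self._iterator = None
            self.on_drain(self)  # owner is notified exactly once

drained = []
buf = DrainableBuffer([b"r1", b"r2"], on_drain=drained.append)
buf.drain()
buf.drain()              # second call is a no-op
assert drained == [buf]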
@@ -824,6 +834,7 @@ def __init__(self, fetch_offset, tp, records, key_deserializer, value_deserializ
            self.record_iterator = itertools.dropwhile(
                self._maybe_skip_record,
                self._unpack_records(tp, records, key_deserializer, value_deserializer))
+            self.on_drain = on_drain

        def _maybe_skip_record(self, record):
            # When fetching an offset that is in the middle of a
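The record_iterator built above wraps the unpacked records in itertools.dropwhile so records before the requested offset (e.g. from a compressed batch that starts earlier) are skipped lazily; a standalone sketch of that pattern:

import itertools

fetch_offset = 103
records = [{"offset": offset} for offset in range(100, 106)]  # batch starting before the fetch offset

def maybe_skip_record(record):
    # Same job as the predicate: drop records below the requested offset
    return record["offset"] < fetch_offset

record_iterator = itertools.dropwhile(maybe_skip_record, iter(records))
assert [r["offset"] for r in record_iterator] == [103, 104, 105]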
@@ -845,6 +856,7 @@ def drain(self):
            if self.record_iterator is not None:
                self.record_iterator = None
                self.metric_aggregator.record(self.topic_partition, self.bytes_read, self.records_read)
+                self.on_drain(self)

        def take(self, n=None):
            return list(itertools.islice(self.record_iterator, 0, n))
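take() slices at most n records off the shared iterator while leaving the remainder for later calls; a small standalone sketch of how itertools.islice behaves here:

import itertools

record_iterator = iter(range(5))  # stand-in for the partition's record iterator

first = list(itertools.islice(record_iterator, 0, 2))    # take(2)
second = list(itertools.islice(record_iterator, 0, 2))   # take(2) again continues where we left off
rest = list(itertools.islice(record_iterator, 0, None))  # take() with n=None drains the rest

assert (first, second, rest) == ([0, 1], [2, 3], [4])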
@@ -943,6 +955,13 @@ def __init__(self, node_id):
        self.session_partitions = {}

    def build_next(self, next_partitions):
+        """
+        Arguments:
+            next_partitions (dict): TopicPartition -> TopicPartitionState
+
+        Returns:
+            FetchRequestData
+        """
        if self.next_metadata.is_full:
            log.debug("Built full fetch %s for node %s with %s partition(s).",
                      self.next_metadata, self.node_id, len(next_partitions))
@@ -965,8 +984,8 @@ def build_next(self, next_partitions):
                altered.add(tp)

        log.debug("Built incremental fetch %s for node %s. Added %s, altered %s, removed %s out of %s",
-                 self.next_metadata, self.node_id, added, altered, removed, self.session_partitions.keys())
-        to_send = {tp: next_partitions[tp] for tp in (added | altered)}
+                  self.next_metadata, self.node_id, added, altered, removed, self.session_partitions.keys())
+        to_send = collections.OrderedDict({tp: next_partitions[tp] for tp in next_partitions if tp in (added | altered)})
        return FetchRequestData(to_send, removed, self.next_metadata)

    def handle_response(self, response):
@@ -1106,18 +1125,11 @@ def epoch(self):
    @property
    def to_send(self):
        # Return as list of [(topic, [(partition, ...), ...]), ...]
-        # so it an be passed directly to encoder
+        # so it can be passed directly to encoder
        partition_data = collections.defaultdict(list)
        for tp, partition_info in six.iteritems(self._to_send):
            partition_data[tp.topic].append(partition_info)
-        # As of version == 3 partitions will be returned in order as
-        # they are requested, so to avoid starvation with
-        # `fetch_max_bytes` option we need this shuffle
-        # NOTE: we do have partition_data in random order due to usage
-        # of unordered structures like dicts, but that does not
-        # guarantee equal distribution, and starting in Python3.6
-        # dicts retain insert order.
-        return random.sample(list(partition_data.items()), k=len(partition_data))
+        return list(partition_data.items())

    @property
    def to_forget(self):
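With the shuffle removed, to_send simply groups the ordered partition map by topic into the [(topic, [partition_info, ...]), ...] shape the encoder expects; a standalone sketch with simplified tuples standing in for the real partition info:

import collections

# (topic, partition) keys and simplified per-partition fetch tuples, for illustration only
to_send = collections.OrderedDict([
    (("topic-a", 0), (0, 0, 1048576)),
    (("topic-a", 1), (1, 0, 1048576)),
    (("topic-b", 0), (0, 0, 1048576)),
])

partition_data = collections.defaultdict(list)
for (topic, _partition), partition_info in to_send.items():
    partition_data[topic].append(partition_info)

# On an insertion-ordered dict (CPython 3.7+) the grouping preserves request order:
assert list(partition_data.items()) == [
    ("topic-a", [(0, 0, 1048576), (1, 0, 1048576)]),
    ("topic-b", [(0, 0, 1048576)]),
]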