apache · ifesdjeen · Jan 27, 2025 · Jan 29, 2025 · Jan 31, 2025 · Jan 31, 2025
diff --git a/.gitmodules b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "modules/accord"]
 	path = modules/accord
-	url = https://github.com/apache/cassandra-accord.git
-	branch = trunk
+	url = https://github.com/ifesdjeen/cassandra-accord.git
+	branch = CASSANDRA-20245
diff --git a/modules/accord b/modules/accord
diff --git a/src/java/org/apache/cassandra/config/AccordSpec.java b/src/java/org/apache/cassandra/config/AccordSpec.java
@@ -194,7 +194,8 @@ public enum TransactionalRangeMigration
     public boolean ephemeralReadEnabled = true;
     public boolean state_cache_listener_jfr_enabled = true;
     public final JournalSpec journal = new JournalSpec();
-    public final MinEpochRetrySpec minEpochSyncRetry = new MinEpochRetrySpec();
+    public final RetrySpec minEpochSyncRetry = new MinEpochRetrySpec();
+    public final RetrySpec fetchRetry = new FetchRetrySpec();
 
     public static class MinEpochRetrySpec extends RetrySpec
     {
@@ -204,6 +205,14 @@ public MinEpochRetrySpec()
         }
     }
 
+    public static class FetchRetrySpec extends RetrySpec
+    {
+        public FetchRetrySpec()
+        {
+            maxAttempts = new MaxAttempt(100);
+        }
+    }
+
     public static class JournalSpec implements Params
     {
         public int segmentSize = 32 << 20;

diff --git a/src/java/org/apache/cassandra/exceptions/RequestFailure.java b/src/java/org/apache/cassandra/exceptions/RequestFailure.java
@@ -43,6 +43,7 @@
 public class RequestFailure
 {
     public static final RequestFailure UNKNOWN = new RequestFailure(RequestFailureReason.UNKNOWN);
+    public static final RequestFailure UNKNOWN_TOPOLOGY = new RequestFailure(RequestFailureReason.UNKNOWN_TOPOLOGY);
     public static final RequestFailure READ_TOO_MANY_TOMBSTONES = new RequestFailure(RequestFailureReason.READ_TOO_MANY_TOMBSTONES);
     public static final RequestFailure TIMEOUT = new RequestFailure(RequestFailureReason.TIMEOUT);
     public static final RequestFailure INCOMPATIBLE_SCHEMA = new RequestFailure(RequestFailureReason.INCOMPATIBLE_SCHEMA);
@@ -134,6 +135,7 @@ public static RequestFailure forReason(RequestFailureReason reason)
         {
             default: throw new IllegalStateException("Unhandled request failure reason " + reason);
             case UNKNOWN: return UNKNOWN;
+            case UNKNOWN_TOPOLOGY: return UNKNOWN_TOPOLOGY;
             case READ_TOO_MANY_TOMBSTONES: return READ_TOO_MANY_TOMBSTONES;
             case TIMEOUT: return TIMEOUT;
             case INCOMPATIBLE_SCHEMA: return INCOMPATIBLE_SCHEMA;

diff --git a/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java b/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java
@@ -42,6 +42,7 @@ public enum RequestFailureReason
     READ_TOO_MANY_INDEXES                 (10),
     RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM (11),
     BOOTING                               (12),
+    UNKNOWN_TOPOLOGY                      (13)
     ;
 
     static

diff --git a/src/java/org/apache/cassandra/net/MessageDelivery.java b/src/java/org/apache/cassandra/net/MessageDelivery.java
@@ -99,14 +99,6 @@ public default <REQ, RSP> Future<Message<RSP>> sendWithRetries(Backoff backoff,
         return promise;
     }
 
-    public default <REQ, RSP> Future<Message<RSP>> sendWithRetries(Verb verb, REQ request,
-                                                                   Iterator<InetAddressAndPort> candidates,
-                                                                   RetryPredicate shouldRetry,
-                                                                   RetryErrorMessage errorMessage)
-    {
-        return sendWithRetries(Backoff.NO_OP.INSTANCE, ImmediateRetryScheduler.instance, verb, request, candidates, shouldRetry, errorMessage);
-    }
-
     public default <REQ, RSP> void sendWithRetries(Backoff backoff, RetryScheduler retryThreads,
                                                    Verb verb, REQ request,
                                                    Iterator<InetAddressAndPort> candidates,
@@ -147,7 +139,8 @@ interface RetryErrorMessage
     }
 
     private static <REQ, RSP> void sendWithRetries(MessageDelivery messaging,
-                                                   Backoff backoff, RetryScheduler retryThreads,
+                                                   Backoff backoff,
+                                                   RetryScheduler retryThreads,
                                                    Verb verb, REQ request,
                                                    Iterator<InetAddressAndPort> candidates,
                                                    OnResult<RSP> onResult,

diff --git a/src/java/org/apache/cassandra/net/MessagingUtils.java b/src/java/org/apache/cassandra/net/MessagingUtils.java
@@ -18,22 +18,31 @@
 
 package org.apache.cassandra.net;
 
+import java.util.Collection;
 import java.util.Iterator;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.locator.InetAddressAndPort;
 import org.apache.cassandra.repair.SharedContext;
 
 public class MessagingUtils
 {
+    private static final Logger logger = LoggerFactory.getLogger(MessagingUtils.class);
+
     /**
      * Candidate iterator that would try all endpoints known to be alive first, and then try all endpoints
      * in a round-robin manner.
+     * <p>
+     * Logs a warning each time the set of peers is exhausted and the cycle restarts.
      */
-    public static Iterator<InetAddressAndPort> tryAliveFirst(SharedContext context, Iterable<InetAddressAndPort> peers)
+    public static Iterator<InetAddressAndPort> tryAliveFirst(SharedContext context, Collection<InetAddressAndPort> peers, String verb)
     {
         return new Iterator<>()
         {
             boolean firstRun = true;
+            int attempt = 0;
             Iterator<InetAddressAndPort> iter = peers.iterator();
             boolean isEmpty = !iter.hasNext();
 
@@ -58,10 +67,13 @@ public InetAddressAndPort next()
 
                 // After that, cycle through all nodes
                 if (!iter.hasNext())
+                {
+                    logger.warn("Exhausted iterator on {} cycling through the set of peers: {} attempt #{}", verb, peers, attempt++);
                     iter = peers.iterator();
+                }
 
                 return iter.next();
             }
         };
     }
-}
+}
diff --git a/src/java/org/apache/cassandra/service/accord/AccordConfigurationService.java b/src/java/org/apache/cassandra/service/accord/AccordConfigurationService.java
@@ -213,6 +213,7 @@ public EpochDiskState truncateTopologyUntil(long epoch, EpochDiskState diskState
         }
     }
 
+    //TODO (required): should not be public
     public final ChangeListener listener = new MetadataChangeListener();
     private class MetadataChangeListener implements ChangeListener
     {
@@ -267,8 +268,6 @@ public synchronized void start()
         Map<Node.Id, Long> removedNodes = mapping.removedNodes();
         for (Map.Entry<Node.Id, Long> e : removedNodes.entrySet())
             onNodeRemoved(e.getValue(), currentTopology(), e.getKey());
-
-        ClusterMetadataService.instance().log().addListener(listener);
     }
 
     @Override
@@ -416,13 +415,36 @@ void maybeReportMetadata(ClusterMetadata metadata)
         long epoch = metadata.epoch.getEpoch();
         synchronized (epochs)
         {
-            if (epochs.maxEpoch() == 0)
+            // On first boot, we have 2 options:
+            //
+            //  - we can start listening to TCM _before_ we replay topologies
+            //  - we can start listening to TCM _after_ we replay topologies
+            //
+            // If we start listening to TCM _before_ we replay topologies from other nodes,
+            // we may end up in a situation where TCM reports metadata that would create an
+            // `epoch - 1` epoch state that is not associated with any topologies, and
+            // therefore should not be listened upon.
+            //
+            // If we start listening to TCM _after_ we replay topologies, we may end up in a
+            // situation where TCM reports metadata that is 1 (or more) epochs _ahead_ of the
+            // last known epoch. Previous implementations were using TCM peer catch up, which
+            // could have resulted in gaps.
+            //
+            // Current protocol solves both problems by _first_ replaying topologies from peers,
+            // then subscribing to TCM _and_, if there are still any gaps, filling them again.
+            // However, it still has a slight chance of creating an `epoch - 1` epoch state
+            // not associated with any topologies, which under "right" circumstances could
+            // have been waited upon with `epochReady`. This check precludes creation of this
+            // epoch: by the time this code can be called, remote topology replay is already
+            // done, so TCM listener will only report epochs that are _at least_ min epoch.
+            if (epochs.maxEpoch() == 0 || epochs.minEpoch() == metadata.epoch.getEpoch())
             {
                 getOrCreateEpochState(epoch);  // touch epoch state so subsequent calls see it
                 reportMetadata(metadata);
                 return;
             }
         }
+
         getOrCreateEpochState(epoch - 1).acknowledged().addCallback(() -> reportMetadata(metadata));
     }
 
@@ -433,16 +455,25 @@ protected void fetchTopologyInternal(long epoch)
         Stage.ACCORD_MIGRATION.execute(() -> {
             if (ClusterMetadata.current().epoch.getEpoch() < epoch)
                 ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(epoch));
+
+            // In most cases, after fetching log from CMS, we will be caught up to the required epoch.
+            // In that case, TCM will also notify Accord via reportMetadata, so we do not need to fetch topologies.
+            // If the reported metadata has skipped one or more epochs, and is _ahead_ of the requested epoch,
+            // we need to fetch topologies from peers to fill in the gap.
+            ClusterMetadata metadata = ClusterMetadata.current();
+            if (metadata.epoch.getEpoch() == epoch)
+                return;
+
             try
             {
-                Set<InetAddressAndPort> peers = new HashSet<>(ClusterMetadata.current().directory.allJoinedEndpoints());
+                Set<InetAddressAndPort> peers = new HashSet<>(metadata.directory.allJoinedEndpoints());
                 peers.remove(FBUtilities.getBroadcastAddressAndPort());
                 if (peers.isEmpty())
                     return;
-                Topology topology;
-                while ((topology = FetchTopology.fetch(SharedContext.Global.instance, peers, epoch).get()) == null)
-                {
-                }
+
+                // TODO (required): fetch only _missing_ topologies.
+                Topology topology = FetchTopology.fetch(SharedContext.Global.instance, peers, epoch).get();
+                Invariants.require(topology.epoch() == epoch);
                 reportTopology(topology);
             }
             catch (InterruptedException e)
@@ -461,6 +492,13 @@ protected void fetchTopologyInternal(long epoch)
         });
     }
 
+    @Override
+    public void reportTopology(Topology topology, boolean isLoad, boolean startSync)
+    {
+        Invariants.require(topology.epoch() <= ClusterMetadata.current().epoch.getEpoch());
+        super.reportTopology(topology, isLoad, startSync);
+    }
+
     @Override
     protected void localSyncComplete(Topology topology, boolean startSync)
     {