apache · ifesdjeen · Jan 27, 2025 · Jan 29, 2025 · Jan 31, 2025 · Jan 31, 2025
diff --git a/.gitmodules b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "modules/accord"]
 	path = modules/accord
-	url = https://github.com/apache/cassandra-accord.git
-	branch = trunk
+	url = https://github.com/ifesdjeen/cassandra-accord.git
+	branch = CASSANDRA-20245
diff --git a/modules/accord b/modules/accord
diff --git a/src/java/org/apache/cassandra/config/AccordSpec.java b/src/java/org/apache/cassandra/config/AccordSpec.java
@@ -194,15 +194,17 @@ public enum TransactionalRangeMigration
     public boolean ephemeralReadEnabled = true;
     public boolean state_cache_listener_jfr_enabled = true;
     public final JournalSpec journal = new JournalSpec();
-    public final MinEpochRetrySpec minEpochSyncRetry = new MinEpochRetrySpec();
-
-    public static class MinEpochRetrySpec extends RetrySpec
-    {
-        public MinEpochRetrySpec()
+    public final RetrySpec minEpochSyncRetry = new RetrySpec() {
         {
             maxAttempts = new MaxAttempt(3);
         }
-    }
+    };
+
+    public final RetrySpec fetchRetry = new RetrySpec() {
+        {
+            maxAttempts = new MaxAttempt(100);
+        }
+    };
 
     public static class JournalSpec implements Params
     {

diff --git a/src/java/org/apache/cassandra/net/MessageDelivery.java b/src/java/org/apache/cassandra/net/MessageDelivery.java
@@ -99,14 +99,6 @@ public default <REQ, RSP> Future<Message<RSP>> sendWithRetries(Backoff backoff,
         return promise;
     }
 
-    public default <REQ, RSP> Future<Message<RSP>> sendWithRetries(Verb verb, REQ request,
-                                                                   Iterator<InetAddressAndPort> candidates,
-                                                                   RetryPredicate shouldRetry,
-                                                                   RetryErrorMessage errorMessage)
-    {
-        return sendWithRetries(Backoff.NO_OP.INSTANCE, ImmediateRetryScheduler.instance, verb, request, candidates, shouldRetry, errorMessage);
-    }
-
     public default <REQ, RSP> void sendWithRetries(Backoff backoff, RetryScheduler retryThreads,
                                                    Verb verb, REQ request,
                                                    Iterator<InetAddressAndPort> candidates,

diff --git a/src/java/org/apache/cassandra/net/MessagingUtils.java b/src/java/org/apache/cassandra/net/MessagingUtils.java
@@ -18,22 +18,31 @@
 
 package org.apache.cassandra.net;
 
+import java.util.Collection;
 import java.util.Iterator;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.locator.InetAddressAndPort;
 import org.apache.cassandra.repair.SharedContext;
 
 public class MessagingUtils
 {
+    private static final Logger logger = LoggerFactory.getLogger(MessagingUtils.class);
+
     /**
      * Candidate iterator that would try all endpoints known to be alive first, and then try all endpoints
      * in a round-robin manner.
+     * <p>
+     * Logs a warning every time the set of peers is exhausted and iteration restarts from the beginning.
      */
-    public static Iterator<InetAddressAndPort> tryAliveFirst(SharedContext context, Iterable<InetAddressAndPort> peers)
+    public static Iterator<InetAddressAndPort> tryAliveFirst(SharedContext context, Collection<InetAddressAndPort> peers, String verb)
     {
         return new Iterator<>()
         {
             boolean firstRun = true;
+            int attempt = 0;
             Iterator<InetAddressAndPort> iter = peers.iterator();
             boolean isEmpty = !iter.hasNext();
 
@@ -58,10 +67,13 @@ public InetAddressAndPort next()
 
                 // After that, cycle through all nodes
                 if (!iter.hasNext())
+                {
+                    logger.warn("Exhausted iterator on {} cycling through the set of peers: {} attempt #{}", verb, peers, attempt++);
                     iter = peers.iterator();
+                }
 
                 return iter.next();
             }
         };
     }
-}
+}
diff --git a/src/java/org/apache/cassandra/service/accord/AccordConfigurationService.java b/src/java/org/apache/cassandra/service/accord/AccordConfigurationService.java
@@ -213,6 +213,7 @@ public EpochDiskState truncateTopologyUntil(long epoch, EpochDiskState diskState
         }
     }
 
+    // TODO: should not be public
     public final ChangeListener listener = new MetadataChangeListener();
     private class MetadataChangeListener implements ChangeListener
     {
@@ -267,8 +268,6 @@ public synchronized void start()
         Map<Node.Id, Long> removedNodes = mapping.removedNodes();
         for (Map.Entry<Node.Id, Long> e : removedNodes.entrySet())
             onNodeRemoved(e.getValue(), currentTopology(), e.getKey());
-
-        ClusterMetadataService.instance().log().addListener(listener);
     }
 
     @Override
@@ -416,14 +415,18 @@ void maybeReportMetadata(ClusterMetadata metadata)
         long epoch = metadata.epoch.getEpoch();
         synchronized (epochs)
         {
-            if (epochs.maxEpoch() == 0)
+            long maxEpoch = epochs.maxEpoch();
+            if (maxEpoch == 0)
             {
                 getOrCreateEpochState(epoch);  // touch epoch state so subsequent calls see it
                 reportMetadata(metadata);
                 return;
             }
         }
-        getOrCreateEpochState(epoch - 1).acknowledged().addCallback(() -> reportMetadata(metadata));
+
+        // Create the state for the previous epoch (epoch - 1) only if we know that epoch may actually exist
+        if (metadata.epoch.getEpoch() > minEpoch())
+            getOrCreateEpochState(epoch - 1).acknowledged().addCallback(() -> reportMetadata(metadata));
     }
 
     @Override
@@ -433,16 +436,25 @@ protected void fetchTopologyInternal(long epoch)
         Stage.ACCORD_MIGRATION.execute(() -> {
             if (ClusterMetadata.current().epoch.getEpoch() < epoch)
                 ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(epoch));
+
+            // In most cases, after fetching log from CMS, we will be caught up to the required epoch.
+            // This TCM will also notify Accord via reportMetadata, so we do not need to fetch topologies.
+            // If the reported metadata has skipped one or more epochs, and is _ahead_ of the requested epoch,
+            // we need to fetch topologies from peers to fill in the gap.
+            ClusterMetadata metadata = ClusterMetadata.current();
+            if (metadata.epoch.getEpoch() == epoch)
+                return;
+
             try
             {
-                Set<InetAddressAndPort> peers = new HashSet<>(ClusterMetadata.current().directory.allJoinedEndpoints());
+                Set<InetAddressAndPort> peers = new HashSet<>(metadata.directory.allJoinedEndpoints());
                 peers.remove(FBUtilities.getBroadcastAddressAndPort());
                 if (peers.isEmpty())
                     return;
-                Topology topology;
-                while ((topology = FetchTopology.fetch(SharedContext.Global.instance, peers, epoch).get()) == null)
-                {
-                }
+
+                // TODO (required): fetch only _missing_ topologies.
+                Topology topology = FetchTopology.fetch(SharedContext.Global.instance, peers, epoch).get();
+                Invariants.require(topology.epoch() == epoch);
                 reportTopology(topology);
             }
             catch (InterruptedException e)
@@ -461,6 +473,13 @@ protected void fetchTopologyInternal(long epoch)
         });
     }
 
+    @Override
+    public void reportTopology(Topology topology, boolean isLoad, boolean startSync)
+    {
+        Invariants.require(topology.epoch() <= ClusterMetadata.current().epoch.getEpoch());
+        super.reportTopology(topology, isLoad, startSync);
+    }
+
     @Override
     protected void localSyncComplete(Topology topology, boolean startSync)
     {

diff --git a/src/java/org/apache/cassandra/service/accord/AccordService.java b/src/java/org/apache/cassandra/service/accord/AccordService.java
@@ -22,13 +22,10 @@
 import java.time.Duration;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collection;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
@@ -44,7 +41,6 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Stopwatch;
 import com.google.common.base.Throwables;
-import com.google.common.collect.Sets;
 import com.google.common.primitives.Ints;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -138,7 +134,6 @@
 import org.apache.cassandra.net.MessageDelivery;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.repair.SharedContext;
-import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.schema.TableMetadata;
@@ -165,7 +160,6 @@
 import org.apache.cassandra.tcm.ClusterMetadataService;
 import org.apache.cassandra.tcm.Epoch;
 import org.apache.cassandra.tcm.membership.NodeId;
-import org.apache.cassandra.tcm.ownership.DataPlacement;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.transport.Dispatcher;
 import org.apache.cassandra.utils.Blocking;
@@ -379,25 +373,48 @@ public synchronized void startup()
         node.commandStores().restoreShardStateUnsafe(topology -> configService.reportTopology(topology, true, true));
         configService.start();
 
-        long minEpoch = fetchMinEpoch();
-        if (minEpoch >= 0)
+        try
         {
-            for (long epoch = minEpoch; epoch <= metadata.epoch.getEpoch(); epoch++)
-                node.configService().fetchTopologyForEpoch(epoch);
+            // Fetch topologies up to current
+            List<Topology> topologies = fetchTopologies(null, metadata);
+            for (Topology topology : topologies)
+                configService.reportTopology(topology);
 
-            try
-            {
-                epochReady(metadata.epoch).get(DatabaseDescriptor.getTransactionTimeout(MILLISECONDS), MILLISECONDS);
-            }
-            catch (InterruptedException e)
+            ClusterMetadataService.instance().log().addListener(configService.listener);
+            ClusterMetadata next = ClusterMetadata.current();
+
+            // if metadata was updated before we were able to add a listener, fetch remaining topologies
+            if (next.epoch.isAfter(metadata.epoch))
             {
-                throw new UncheckedInterruptedException(e);
+                topologies = fetchTopologies(metadata.epoch.getEpoch() + 1, next);
+                for (Topology topology : topologies)
+                    configService.reportTopology(topology);
             }
-            catch (ExecutionException | TimeoutException e)
+
+            int attempt = 0;
+            int waitSeconds = 5;
+            while (true)
             {
-                throw new RuntimeException(e);
+                try
+                {
+                    epochReady(metadata.epoch).get(waitSeconds, SECONDS);
+                    break;
+                }
+                catch (TimeoutException e)
+                {
+                    logger.warn("Epoch {} is not ready after waiting for {} seconds", metadata.epoch, (++attempt) * waitSeconds);
+                }
             }
         }
+        catch (InterruptedException e)
+        {
+            Thread.currentThread().interrupt();
+            throw new UncheckedInterruptedException(e);
+        }
+        catch (ExecutionException e)
+        {
+            throw new RuntimeException(e);
+        }
 
         fastPathCoordinator.start();
         ClusterMetadataService.instance().log().addListener(fastPathCoordinator);
@@ -412,44 +429,60 @@ public synchronized void startup()
     }
 
     /**
-     * Queries peers to discover min epoch
+     * Queries peers to discover min epoch, and then fetches all topologies between min and current epochs
      */
-    private long fetchMinEpoch()
+    private List<Topology> fetchTopologies(Long minEpoch, ClusterMetadata metadata) throws ExecutionException, InterruptedException
     {
-        ClusterMetadata metadata = ClusterMetadata.current();
-        Map<InetAddressAndPort, Set<TokenRange>> peers = new HashMap<>();
-        for (KeyspaceMetadata keyspace : metadata.schema.getKeyspaces())
-        {
-            List<TableMetadata> tables = keyspace.tables.stream().filter(TableMetadata::requiresAccordSupport).collect(Collectors.toList());
-            if (tables.isEmpty())
-                continue;
-            DataPlacement current = metadata.placements.get(keyspace.params.replication);
-            DataPlacement settled = metadata.writePlacementAllSettled(keyspace);
-            Sets.SetView<InetAddressAndPort> alive = Sets.intersection(settled.writes.byEndpoint().keySet(), current.writes.byEndpoint().keySet());
-            InetAddressAndPort self = FBUtilities.getBroadcastAddressAndPort();
-            settled.writes.forEach((range, group) -> {
-                if (group.endpoints().contains(self))
-                {
-                    for (InetAddressAndPort peer : group.endpoints())
-                    {
-                        if (peer.equals(self) || !alive.contains(peer)) continue;
-                        for (TableMetadata table : tables)
-                            peers.computeIfAbsent(peer, i -> new HashSet<>()).add(AccordTopology.fullRange(table.id));
-                    }
-                }
-            });
-        }
+        if (minEpoch != null && minEpoch == metadata.epoch.getEpoch())
+            return Collections.singletonList(AccordTopology.createAccordTopology(metadata));
+
+        Set<InetAddressAndPort> peers = new HashSet<>();
+        peers.addAll(metadata.directory.allAddresses());
+        peers.remove(FBUtilities.getBroadcastAddressAndPort());
+
+        // No peers: single node cluster or first node to boot
         if (peers.isEmpty())
-            return -1;
+            return Collections.singletonList(AccordTopology.createAccordTopology(metadata));
 
-        Long minEpoch = findMinEpoch(SharedContext.Global.instance, peers);
+        // Bootstrap, fetch min epoch
         if (minEpoch == null)
-            return -1;
-        return minEpoch;
+        {
+            Long fetched = findMinEpoch(SharedContext.Global.instance, peers);
+            if (fetched != null)
+                logger.info("Discovered min epoch of {} by querying {}", fetched, peers);
+
+            // No other node has advanced epoch just yet
+            if (fetched == null || fetched == metadata.epoch.getEpoch())
+                return Collections.singletonList(AccordTopology.createAccordTopology(metadata));
+
+            minEpoch = fetched;
+        }
+
+        long maxEpoch = metadata.epoch.getEpoch();
+
+        // If we are behind minEpoch, catch up to at least minEpoch
+        if (metadata.epoch.getEpoch() < minEpoch)
+        {
+            minEpoch = metadata.epoch.getEpoch();
+            maxEpoch = minEpoch;
+        }
+
+        List<Future<Topology>> futures = new ArrayList<>();
+        logger.info("Fetching topologies for epochs [{}, {}].", minEpoch, maxEpoch);
+
+        for (long epoch = minEpoch; epoch <= maxEpoch; epoch++)
+            futures.add(FetchTopology.fetch(SharedContext.Global.instance, peers, epoch));
+
+        FBUtilities.waitOnFutures(futures);
+        List<Topology> topologies = new ArrayList<>(futures.size());
+        for (Future<Topology> future : futures)
+            topologies.add(future.get());
+
+        return topologies;
     }
 
     @VisibleForTesting
-    static Long findMinEpoch(SharedContext context, Map<InetAddressAndPort, Set<TokenRange>> peers)
+    static Long findMinEpoch(SharedContext context, Set<InetAddressAndPort> peers)
     {
         try
         {
@@ -1152,7 +1185,7 @@ private static CommandStoreTxnBlockedGraph.TxnState populate(CommandStoreTxnBloc
 
     @Nullable
     @Override
-    public Long minEpoch(Collection<TokenRange> ranges)
+    public Long minEpoch()
     {
         return node.topology().minEpoch();
     }