Address Ariel's and David's comments

ifesdjeen committed Jan 30, 2025
1 parent 85c4d9e commit 6be92d2

Showing 9 changed files with 96 additions and 87 deletions.
4 changes: 2 additions & 2 deletions .gitmodules
@@ -1,4 +1,4 @@
[submodule "modules/accord"]
path = modules/accord
url = https://github.com/apache/cassandra-accord.git
branch = trunk
url = https://github.com/ifesdjeen/cassandra-accord.git
branch = CASSANDRA-20245
2 changes: 1 addition & 1 deletion modules/accord
14 changes: 8 additions & 6 deletions src/java/org/apache/cassandra/config/AccordSpec.java
@@ -194,15 +194,17 @@ public enum TransactionalRangeMigration
public boolean ephemeralReadEnabled = true;
public boolean state_cache_listener_jfr_enabled = true;
public final JournalSpec journal = new JournalSpec();
public final MinEpochRetrySpec minEpochSyncRetry = new MinEpochRetrySpec();

public static class MinEpochRetrySpec extends RetrySpec
{
public MinEpochRetrySpec()
public final RetrySpec minEpochSyncRetry = new RetrySpec() {
{
maxAttempts = new MaxAttempt(3);
}
}
};

public final RetrySpec fetchRetry = new RetrySpec() {
{
maxAttempts = new MaxAttempt(100);
}
};

public static class JournalSpec implements Params
{
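The hunk above replaces the dedicated MinEpochRetrySpec class with anonymous RetrySpec subclasses whose defaults are overridden in instance initializer blocks, and adds a second spec (fetchRetry, capped at 100 attempts) for topology fetches. A minimal sketch of that initializer-block idiom follows; RetryDefaults and its maxAttempts field are hypothetical stand-ins, not the real RetrySpec/MaxAttempt types.

// Sketch only: RetryDefaults is a hypothetical stand-in for RetrySpec.
public class RetryDefaultsExample
{
    static class RetryDefaults
    {
        int maxAttempts = Integer.MAX_VALUE; // retry indefinitely unless overridden
    }

    // An instance initializer block in an anonymous subclass tweaks one default
    // without declaring a named configuration class per call site.
    static final RetryDefaults MIN_EPOCH_SYNC = new RetryDefaults() {
        {
            maxAttempts = 3;
        }
    };

    static final RetryDefaults FETCH = new RetryDefaults() {
        {
            maxAttempts = 100;
        }
    };

    public static void main(String[] args)
    {
        System.out.println(MIN_EPOCH_SYNC.maxAttempts); // 3
        System.out.println(FETCH.maxAttempts);          // 100
    }
}

The initializer block runs right after the superclass constructor, so only the listed fields diverge from the defaults.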
8 changes: 0 additions & 8 deletions src/java/org/apache/cassandra/net/MessageDelivery.java
@@ -99,14 +99,6 @@ public default <REQ, RSP> Future<Message<RSP>> sendWithRetries(Backoff backoff,
return promise;
}

public default <REQ, RSP> Future<Message<RSP>> sendWithRetries(Verb verb, REQ request,
Iterator<InetAddressAndPort> candidates,
RetryPredicate shouldRetry,
RetryErrorMessage errorMessage)
{
return sendWithRetries(Backoff.NO_OP.INSTANCE, ImmediateRetryScheduler.instance, verb, request, candidates, shouldRetry, errorMessage);
}

public default <REQ, RSP> void sendWithRetries(Backoff backoff, RetryScheduler retryThreads,
Verb verb, REQ request,
Iterator<InetAddressAndPort> candidates,
16 changes: 14 additions & 2 deletions src/java/org/apache/cassandra/net/MessagingUtils.java
@@ -18,22 +18,31 @@

package org.apache.cassandra.net;

import java.util.Collection;
import java.util.Iterator;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.locator.InetAddressAndPort;
import org.apache.cassandra.repair.SharedContext;

public class MessagingUtils
{
private static final Logger logger = LoggerFactory.getLogger(MessagingUtils.class);

/**
* Candidate iterator that first tries all endpoints known to be alive, then cycles through all
* endpoints in a round-robin manner.
* <p>
* Logs a warning each time the full set of peers has been exhausted before cycling again.
*/
public static Iterator<InetAddressAndPort> tryAliveFirst(SharedContext context, Iterable<InetAddressAndPort> peers)
public static Iterator<InetAddressAndPort> tryAliveFirst(SharedContext context, Collection<InetAddressAndPort> peers, String verb)
{
return new Iterator<>()
{
boolean firstRun = true;
int attempt = 0;
Iterator<InetAddressAndPort> iter = peers.iterator();
boolean isEmpty = !iter.hasNext();

@@ -58,10 +67,13 @@ public InetAddressAndPort next()

// After that, cycle through all nodes
if (!iter.hasNext())
{
logger.warn("Exhausted iterator on {} cycling through the set of peers: {} attempt #{}", verb, peers, attempt++);
iter = peers.iterator();
}

return iter.next();
}
};
}
}
}
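The widened tryAliveFirst signature above takes the peer collection and the verb name so the wrap-around warning can say which request keeps cycling and how many passes it has made. A simplified, self-contained sketch of the cycling behaviour is below; it uses plain Strings instead of InetAddressAndPort and omits the alive-first pass, so it illustrates the pattern rather than reproducing the production iterator.

import java.util.Iterator;
import java.util.List;

// Sketch only: endpoints are Strings and the alive-first pass is omitted.
public class CyclingIteratorExample
{
    static Iterator<String> cycling(List<String> peers, String verb)
    {
        return new Iterator<>()
        {
            int attempt = 0;
            Iterator<String> iter = peers.iterator();

            @Override
            public boolean hasNext()
            {
                return !peers.isEmpty();
            }

            @Override
            public String next()
            {
                if (!iter.hasNext())
                {
                    // Mirrors the warning added above: report the verb and the pass count.
                    System.out.printf("Exhausted iterator on %s, cycling through peers %s, attempt #%d%n",
                                      verb, peers, attempt++);
                    iter = peers.iterator();
                }
                return iter.next();
            }
        };
    }

    public static void main(String[] args)
    {
        Iterator<String> it = cycling(List.of("a", "b"), "ACCORD_FETCH_TOPOLOGY_REQ");
        for (int i = 0; i < 5; i++)
            System.out.println(it.next()); // a, b, (warning) a, b, (warning) a
    }
}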
@@ -416,17 +416,17 @@ void maybeReportMetadata(ClusterMetadata metadata)
synchronized (epochs)
{
long maxEpoch = epochs.maxEpoch();
if (maxEpoch >= epoch)
return;

if (maxEpoch == 0)
{
getOrCreateEpochState(epoch); // touch epoch state so subsequent calls see it
reportMetadata(metadata);
return;
}
}
getOrCreateEpochState(epoch - 1).acknowledged().addCallback(() -> reportMetadata(metadata));

// Create the (epoch - 1) state iff we know this epoch may actually exist
if (metadata.epoch.getEpoch() > minEpoch())
getOrCreateEpochState(epoch - 1).acknowledged().addCallback(() -> reportMetadata(metadata));
}

@Override
@@ -436,14 +436,25 @@ protected void fetchTopologyInternal(long epoch)
Stage.ACCORD_MIGRATION.execute(() -> {
if (ClusterMetadata.current().epoch.getEpoch() < epoch)
ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(epoch));

// In most cases, after fetching the log from CMS, we will be caught up to the required epoch.
// TCM will also notify Accord via reportMetadata, so we do not need to fetch topologies.
// If the reported metadata has skipped one or more epochs and is _ahead_ of the requested epoch,
// we need to fetch topologies from peers to fill in the gap.
ClusterMetadata metadata = ClusterMetadata.current();
if (metadata.epoch.getEpoch() == epoch)
return;

try
{
Set<InetAddressAndPort> peers = new HashSet<>(ClusterMetadata.current().directory.allJoinedEndpoints());
Set<InetAddressAndPort> peers = new HashSet<>(metadata.directory.allJoinedEndpoints());
peers.remove(FBUtilities.getBroadcastAddressAndPort());
if (peers.isEmpty())
return;
Topology topology;
while ((topology = FetchTopology.fetch(SharedContext.Global.instance, peers, epoch).get()) == null) {}

// TODO (required): fetch only _missing_ topologies.
Topology topology = FetchTopology.fetch(SharedContext.Global.instance, peers, epoch).get();
Invariants.require(topology.epoch() == epoch);
reportTopology(topology);
}
catch (InterruptedException e)
49 changes: 30 additions & 19 deletions src/java/org/apache/cassandra/service/accord/AccordService.java
@@ -376,25 +376,39 @@ public synchronized void startup()
try
{
// Fetch topologies up to current
List<Topology> topologies = fetchTopologies(0, metadata);
List<Topology> topologies = fetchTopologies(null, metadata);
for (Topology topology : topologies)
configService.reportTopology(topology);

ClusterMetadataService.instance().log().addListener(configService.listener);
ClusterMetadata next = ClusterMetadata.current();

// if metadata was updated before we were able to add a listener, fetch remaining topologies
if (metadata != next)
if (next.epoch.isAfter(metadata.epoch))
{
topologies = fetchTopologies(metadata.epoch.getEpoch(), next);
topologies = fetchTopologies(metadata.epoch.getEpoch() + 1, next);
for (Topology topology : topologies)
configService.reportTopology(topology);
}

epochReady(metadata.epoch).get();
int attempt = 0;
int waitSeconds = 5;
while (true)
{
try
{
epochReady(metadata.epoch).get(5, SECONDS);
break;
}
catch (TimeoutException e)
{
logger.warn("Epoch {} is not ready after waiting for {} seconds", metadata.epoch, (++attempt) * waitSeconds);
}
}
}
catch (InterruptedException e)
{
Thread.currentThread().interrupt();
throw new UncheckedInterruptedException(e);
}
catch (ExecutionException e)
@@ -417,28 +431,28 @@ public synchronized void startup()
/**
* Queries peers to discover min epoch, and then fetches all topologies between min and current epochs
*/
private List<Topology> fetchTopologies(long minEpoch, ClusterMetadata metadata) throws ExecutionException, InterruptedException
private List<Topology> fetchTopologies(Long minEpoch, ClusterMetadata metadata) throws ExecutionException, InterruptedException
{
if (configService.maxEpoch() >= metadata.epoch.getEpoch())
{
logger.info("Accord epoch {} matches TCM. All topologies are known locally", metadata.epoch);
if (minEpoch != null && minEpoch == metadata.epoch.getEpoch())
return Collections.singletonList(AccordTopology.createAccordTopology(metadata));
}

Set<InetAddressAndPort> peers = new HashSet<>();
peers.addAll(metadata.directory.allAddresses());
peers.remove(FBUtilities.getBroadcastAddressAndPort());

// No peers: single node cluster or first node to boot
if (peers.isEmpty())
return Collections.singletonList(AccordTopology.createAccordTopology(metadata));;
return Collections.singletonList(AccordTopology.createAccordTopology(metadata));

// Bootstrap, fetch min epoch
if (minEpoch == 0)
if (minEpoch == null)
{
long fetched = findMinEpoch(SharedContext.Global.instance, peers);
Long fetched = findMinEpoch(SharedContext.Global.instance, peers);
if (fetched != null)
logger.info("Discovered min epoch of {} by querying {}", fetched, peers);

// No other node has advanced epoch just yet
if (fetched == 0)
if (fetched == null || fetched == metadata.epoch.getEpoch())
return Collections.singletonList(AccordTopology.createAccordTopology(metadata));

minEpoch = fetched;
@@ -454,7 +468,7 @@ private List<Topology> fetchTopologies(long minEpoch, ClusterMetadata metadata)
}

List<Future<Topology>> futures = new ArrayList<>();
logger.info("Discovered min epoch of {}. Proceeding to fetch epochs up to {}.", minEpoch, maxEpoch);
logger.info("Fetching topologies for epochs [{}, {}].", minEpoch, maxEpoch);

for (long epoch = minEpoch; epoch <= maxEpoch; epoch++)
futures.add(FetchTopology.fetch(SharedContext.Global.instance, peers, epoch));
@@ -468,14 +482,11 @@ private List<Topology> fetchTopologies(long minEpoch, ClusterMetadata metadata)
}

@VisibleForTesting
static long findMinEpoch(SharedContext context, Set<InetAddressAndPort> peers)
static Long findMinEpoch(SharedContext context, Set<InetAddressAndPort> peers)
{
try
{
Long result = FetchMinEpoch.fetch(context, peers).get();
if (result == null)
return 0L;
return result.longValue();
return FetchMinEpoch.fetch(context, peers).get();
}
catch (InterruptedException e)
{
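startup() above no longer blocks indefinitely on epochReady(metadata.epoch).get(); it waits in 5-second slices and logs a warning whenever the epoch is still not ready, so a stuck startup is visible in the logs. A standalone sketch of that wait-in-slices pattern, using a plain CompletableFuture as a stand-in for the Accord future returned by epochReady:

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

// Sketch only: CompletableFuture stands in for the Accord future type.
public class BoundedWaitExample
{
    static void awaitWithProgress(CompletableFuture<Void> ready, long epoch)
            throws InterruptedException, ExecutionException
    {
        int attempt = 0;
        int waitSeconds = 5;
        while (true)
        {
            try
            {
                ready.get(waitSeconds, TimeUnit.SECONDS);
                return;
            }
            catch (TimeoutException e)
            {
                // Keep waiting, but leave a trail in the logs on every slice.
                System.out.printf("Epoch %d is not ready after waiting for %d seconds%n",
                                  epoch, ++attempt * waitSeconds);
            }
        }
    }

    public static void main(String[] args) throws Exception
    {
        CompletableFuture<Void> ready = new CompletableFuture<>();
        // Complete from another thread after ~12 seconds to show two warnings.
        new Thread(() -> {
            try { Thread.sleep(12_000); } catch (InterruptedException ignored) {}
            ready.complete(null);
        }).start();
        awaitWithProgress(ready, 42);
        System.out.println("epoch 42 ready");
    }
}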
42 changes: 23 additions & 19 deletions src/java/org/apache/cassandra/service/accord/FetchTopology.java
@@ -21,11 +21,8 @@
import java.io.IOException;
import java.util.Collection;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import accord.topology.Topology;
import org.apache.cassandra.exceptions.RequestFailure;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.io.IVersionedSerializer;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
@@ -37,11 +34,18 @@
import org.apache.cassandra.net.Verb;
import org.apache.cassandra.repair.SharedContext;
import org.apache.cassandra.service.accord.serializers.TopologySerializers;
import org.apache.cassandra.utils.Backoff;
import org.apache.cassandra.utils.concurrent.Future;

public class FetchTopology
{
private static final Logger log = LoggerFactory.getLogger(FetchTopology.class);
public String toString()
{
return "FetchTopology{" +
"epoch=" + epoch +
'}';
}

private final long epoch;

public static final IVersionedSerializer<FetchTopology> serializer = new IVersionedSerializer<>()
@@ -72,11 +76,6 @@ public FetchTopology(long epoch)

public static class Response
{
private static Response unkonwn(long epoch)
{
throw new IllegalStateException("Unknown topology: " + epoch);
}

// TODO (required): messaging version after version patch
public static final IVersionedSerializer<Response> serializer = new IVersionedSerializer<>()
{
@@ -115,20 +114,25 @@ public Response(long epoch, Topology topology)
public static final IVerbHandler<FetchTopology> handler = message -> {
long epoch = message.payload.epoch;
Topology topology = AccordService.instance().topology().maybeGlobalForEpoch(epoch);
if (topology == null)
MessagingService.instance().respond(Response.unkonwn(epoch), message);
else
if (topology != null)
MessagingService.instance().respond(new Response(epoch, topology), message);
else
throw new IllegalStateException("Unknown topology: " + epoch);
};

private static final Logger logger = LoggerFactory.getLogger(FetchTopology.class);

public static Future<Topology> fetch(SharedContext context, Collection<InetAddressAndPort> peers, long epoch)
{
FetchTopology req = new FetchTopology(epoch);
return context.messaging().<FetchTopology, Response>sendWithRetries(Verb.ACCORD_FETCH_TOPOLOGY_REQ, req,
MessagingUtils.tryAliveFirst(SharedContext.Global.instance, peers),
(attempt, from, failure) -> true,
FetchTopology request = new FetchTopology(epoch);
Backoff backoff = Backoff.fromConfig(context, DatabaseDescriptor.getAccord().fetchRetry);
return context.messaging().<FetchTopology, Response>sendWithRetries(backoff,
context.optionalTasks()::schedule,
Verb.ACCORD_FETCH_TOPOLOGY_REQ,
request,
MessagingUtils.tryAliveFirst(SharedContext.Global.instance, peers, Verb.ACCORD_FETCH_TOPOLOGY_REQ.name()),
(attempt, from, failure) -> {
System.out.println("Got " + failure + " from " + from + " while fetching " + request);
return true;
},
MessageDelivery.RetryErrorMessage.EMPTY)
.map(m -> m.payload.topology);
}
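fetch() above now routes retries through a Backoff built from the new fetchRetry spec (capped at 100 attempts) and the optional-tasks scheduler, replacing the convenience overload (no backoff, immediate retry scheduler) removed from MessageDelivery. The real path is asynchronous; the sketch below is a synchronous approximation of bounded retries with exponential backoff, and every name in it is invented for illustration.

import java.util.concurrent.Callable;

// Sketch only: a synchronous stand-in for sendWithRetries(backoff, scheduler, ...).
public class RetryWithBackoffExample
{
    static <T> T retry(Callable<T> action, int maxAttempts, long baseDelayMillis) throws Exception
    {
        Exception last = null;
        for (int attempt = 0; attempt < maxAttempts; attempt++)
        {
            try
            {
                return action.call();
            }
            catch (Exception e)
            {
                last = e;
                // Exponential backoff capped at 10s; a real Backoff implementation may also add jitter.
                long delay = Math.min(baseDelayMillis << Math.min(attempt, 6), 10_000);
                Thread.sleep(delay);
            }
        }
        throw last;
    }

    public static void main(String[] args) throws Exception
    {
        int[] calls = {0};
        String result = retry(() -> {
            if (++calls[0] < 3)
                throw new RuntimeException("simulated failure " + calls[0]);
            return "topology for epoch 42";
        }, 100, 100);
        System.out.println(result + " after " + calls[0] + " attempts");
    }
}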
@@ -234,27 +234,4 @@ private static MessageDelivery.MaxRetriesException getMaxRetriesException(Future
}
return maxRetries;
}

private static MessageDelivery.FailedResponseException getFailedResponseException(Future<Long> f) throws InterruptedException, ExecutionException
{
MessageDelivery.FailedResponseException exception;
try
{
f.get();
Assert.fail("Future should have failed");
throw new AssertionError("Unreachable");
}
catch (ExecutionException e)
{
if (e.getCause() instanceof MessageDelivery.FailedResponseException)
{
exception = (MessageDelivery.FailedResponseException) e.getCause();
}
else
{
throw e;
}
}
return exception;
}
}
