diff --git a/plugins/nf-azure/src/main/nextflow/cloud/azure/batch/AzBatchService.groovy b/plugins/nf-azure/src/main/nextflow/cloud/azure/batch/AzBatchService.groovy
index cd3e4b37f5..6409ba96af 100644
--- a/plugins/nf-azure/src/main/nextflow/cloud/azure/batch/AzBatchService.groovy
+++ b/plugins/nf-azure/src/main/nextflow/cloud/azure/batch/AzBatchService.groovy
@@ -781,7 +781,21 @@ class AzBatchService implements Closeable {
                     .setTargetDedicatedNodes(spec.opts.vmCount)
         }
 
-        apply(() -> client.createPool(poolParams))
+        try {
+            apply(() -> client.createPool(poolParams))
+        }
+        catch (HttpResponseException e) {
+            // A 409 response carrying the `PoolExists` code means the pool is already there:
+            // the creation request is ignored instead of being reported as a failure.
+            // The response body is a reactive stream whose `toString()` does not render the
+            // payload, so the error code is looked up in the exception message instead.
+            // NOTE(review): assumes the SDK embeds the error payload in the message - confirm
+            if( e.response?.statusCode == 409 && e.message?.contains('PoolExists') ) {
+                log.debug "Pool ${spec.poolId} already exists, ignoring creation request"
+                return
+            }
+            throw e
+        }
     }
 
     protected String scaleFormula(AzPoolOpts opts) {
@@ -923,7 +937,9 @@ class AzBatchService implements Closeable {
                 .build()
     }
 
-    final private static List RETRY_CODES = List.of(408, 429, 500, 502, 503, 504)
+    // NOTE(review): 409 (Conflict) is retried as well - confirm this does not repeat the
+    // create-pool request before the `PoolExists` handling in `createPool` can run
+    final private static List RETRY_CODES = List.of(408, 409, 429, 500, 502, 503, 504)
 
     /**
      * Carry out the invocation of the specified action using a retry policy
diff --git a/plugins/nf-azure/src/test/nextflow/cloud/azure/batch/AzBatchServiceTest.groovy b/plugins/nf-azure/src/test/nextflow/cloud/azure/batch/AzBatchServiceTest.groovy
index eceeb644b2..ea0eb7107b 100644
--- a/plugins/nf-azure/src/test/nextflow/cloud/azure/batch/AzBatchServiceTest.groovy
+++ b/plugins/nf-azure/src/test/nextflow/cloud/azure/batch/AzBatchServiceTest.groovy
@@ -24,6 +24,12 @@ import nextflow.util.Duration
 import nextflow.util.MemoryUnit
 import spock.lang.Specification
 import spock.lang.Unroll
+import com.azure.core.exception.HttpResponseException
+import com.azure.core.http.HttpResponse
+import reactor.core.publisher.Flux
+import java.nio.ByteBuffer
+import java.nio.charset.StandardCharsets
+import com.azure.core.exception.ResourceExistsException
 /**
  *
  * @author Paolo Di Tommaso
@@ -739,4 +745,62 @@ class AzBatchServiceTest extends Specification {
         [managedIdentity: [clientId: 'client-123']] | 'client-123'
     }
 
+    def 'should handle pool exists error' () {
+        given:
+        def CONFIG = [batch:[location: 'northeurope']]
+        def exec = Mock(AzBatchExecutor) {getConfig() >> new AzConfig(CONFIG) }
+        def batchImage = GroovyMock(com.azure.compute.batch.models.BatchSupportedImage)
+        def vmConfig = GroovyMock(com.azure.compute.batch.models.VirtualMachineConfiguration)
+        def client = GroovyMock(com.azure.compute.batch.BatchClient)
+        AzBatchService svc = Spy(new AzBatchService(exec)) {
+            getImage(_) >> batchImage
+            poolVmConfig(_) >> vmConfig
+            getClient() >> client
+        }
+        and:
+        def spec = new AzVmPoolSpec(
+            poolId: 'pool-1',
+            vmType: Mock(AzVmType) {
+                getName() >> 'Standard_D1_v2'
+                getNumberOfCores() >> 1
+            },
+            opts: new AzPoolOpts([:]))
+        and:
+        def errorBody = """{
+            "odata.metadata":"https://mybatch.eastus.batch.azure.com/\$metadata#Microsoft.Azure.Batch.Protocol.Entities.Container.errors/@Element",
+            "code":"PoolExists",
+            "message":{
+                "lang":"en-US",
+                "value":"The specified pool already exists.\\nRequestId:d9475bc1-f9e5-492b-9114-6a05a8a57abc\\nTime:2025-01-23T14:15:51.6526349Z"
+            }
+        }"""
+        def bodyBytes = errorBody.getBytes(StandardCharsets.UTF_8)
+        def response = GroovyMock(HttpResponse) {
+            getStatusCode() >> 409
+            getBodyAsString() >> errorBody
+            getBody() >> Flux.just(ByteBuffer.wrap(bodyBytes))
+        }
+        // NOTE(review): message shaped as "Status code <code>, <payload>", which is how the
+        // SDK is assumed to report service errors - confirm
+        def message = 'Status code 409, "' + errorBody + '"'
+
+        when:
+        svc.createPool(spec)
+        then:
+        1 * client.createPool(_) >> { throw new ResourceExistsException(message, response) }
+        noExceptionThrown()
+    }
+
+    def 'should retry on specific error codes' () {
+        given:
+        def CONFIG = [batch:[location: 'northeurope']]
+        def exec = Mock(AzBatchExecutor) {getConfig() >> new AzConfig(CONFIG) }
+        AzBatchService svc = Spy(new AzBatchService(exec))
+
+        when:
+        def cond = svc.@RETRY_CODES
+        then:
+        cond.containsAll([408, 409, 429, 500, 502, 503, 504])
+    }
+
 }