From 4e19ec4bb6286163b77db2e7deeac351ca087659 Mon Sep 17 00:00:00 2001
From: Paolo Di Tommaso
Date: Thu, 27 Feb 2025 23:38:22 +0100
Subject: [PATCH] Add DenyCrawlerFilter

Signed-off-by: Paolo Di Tommaso
---
 .../controller/ServiceInfoController.groovy   |   2 +-
 .../wave/filter/DenyCrawlerFilter.groovy      | 102 ++++++++++++++++++
 .../io/seqera/wave/filter/FilterOrder.groovy  |   1 +
 .../service/data/stream/MessageStream.groovy  |   2 +-
 .../controller/InspectControllerTest.groovy   |   1 -
 .../wave/controller/ScanControllerTest.groovy |   3 +-
 .../ServiceInfoControllerTest.groovy          |  54 ++++++++--
 7 files changed, 153 insertions(+), 12 deletions(-)
 create mode 100644 src/main/groovy/io/seqera/wave/filter/DenyCrawlerFilter.groovy

diff --git a/src/main/groovy/io/seqera/wave/controller/ServiceInfoController.groovy b/src/main/groovy/io/seqera/wave/controller/ServiceInfoController.groovy
index 85d43dd4a..80ecd495f 100644
--- a/src/main/groovy/io/seqera/wave/controller/ServiceInfoController.groovy
+++ b/src/main/groovy/io/seqera/wave/controller/ServiceInfoController.groovy
@@ -59,7 +59,7 @@ class ServiceInfoController {
                 : HttpResponse.badRequest()
     }
 
-    @Get(uri = "/openapi")
+    @Get("/openapi")
     HttpResponse getOpenAPI() {
         HttpResponse.redirect(URI.create("/openapi/"))
     }
diff --git a/src/main/groovy/io/seqera/wave/filter/DenyCrawlerFilter.groovy b/src/main/groovy/io/seqera/wave/filter/DenyCrawlerFilter.groovy
new file mode 100644
index 000000000..593eedd4b
--- /dev/null
+++ b/src/main/groovy/io/seqera/wave/filter/DenyCrawlerFilter.groovy
@@ -0,0 +1,102 @@
+/*
+ *  Wave, containers provisioning service
+ *  Copyright (c) 2023-2024, Seqera Labs
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Affero General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Affero General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Affero General Public License
+ *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+package io.seqera.wave.filter
+
+import groovy.transform.CompileStatic
+import groovy.util.logging.Slf4j
+import io.micronaut.http.HttpRequest
+import io.micronaut.http.HttpResponse
+import io.micronaut.http.HttpStatus
+import io.micronaut.http.MutableHttpResponse
+import io.micronaut.http.annotation.Filter
+import io.micronaut.http.filter.HttpServerFilter
+import io.micronaut.http.filter.ServerFilterChain
+import org.reactivestreams.Publisher
+import reactor.core.publisher.Flux
+/**
+ * Block the access to known crawler bots by rejecting any request whose
+ * {@code User-Agent} header contains one of the known crawler tokens
+ *
+ * @author Paolo Di Tommaso
+ */
+@Slf4j
+@CompileStatic
+@Filter("/**")
+class DenyCrawlerFilter implements HttpServerFilter {
+
+    /**
+     * Lower-case tokens identifying the user agent of well-known crawlers
+     */
+    private static final List<String> CRAWLER_AGENTS = Arrays.asList(
+            "googlebot",
+            "bingbot",
+            "yandexbot",
+            "baiduspider",
+            "duckduckbot",
+            "slurp",
+            "facebot",
+            "twitterbot",
+            "mj12bot",
+            "ahrefsbot"
+    )
+
+    /**
+     * Check whether a user agent belongs to a known crawler. The match is case-insensitive.
+     *
+     * @param userAgent The value of the {@code User-Agent} request header, it can be null or empty
+     * @return {@code true} when the user agent contains a known crawler token, {@code false} otherwise
+     */
+    static boolean isCrawler(String userAgent) {
+        if (!userAgent) {
+            return false
+        }
+        // Normalise the case here, with a fixed locale, so that callers are not required to do it
+        final agent = userAgent.toLowerCase(Locale.ROOT)
+        return CRAWLER_AGENTS.stream().anyMatch(agent::contains)
+    }
+
+    /**
+     * Reject with a {@code 405} status the requests made by a known crawler,
+     * any other request is passed on to the filter chain.
+     *
+     * @param request The incoming HTTP request
+     * @param chain The filter chain used to continue the request processing
+     * @return A publisher emitting the response for the request
+     */
+    @Override
+    Publisher<MutableHttpResponse<?>> doFilter(HttpRequest<?> request, ServerFilterChain chain) {
+        final userAgent = request.getHeaders().get("User-Agent")
+        // Check if the request is made by a known crawler
+        if (isCrawler(userAgent)) {
+            // Return immediately without processing the request
+            log.debug("Request denied: ${request}")
+            return Flux.just(HttpResponse.status(HttpStatus.METHOD_NOT_ALLOWED))
+        }
+        // Continue processing the request
+        return chain.proceed(request)
+    }
+
+    /**
+     * @return The order of this filter, as defined by {@link FilterOrder#DENY_CRAWLER}
+     */
+    @Override
+    int getOrder() {
+        return FilterOrder.DENY_CRAWLER
+    }
+}
diff --git a/src/main/groovy/io/seqera/wave/filter/FilterOrder.groovy b/src/main/groovy/io/seqera/wave/filter/FilterOrder.groovy
index 057d4fcca..0c8919690 100644
--- a/src/main/groovy/io/seqera/wave/filter/FilterOrder.groovy
+++ b/src/main/groovy/io/seqera/wave/filter/FilterOrder.groovy
@@ -27,6 +27,7 @@ package io.seqera.wave.filter
  */
 interface FilterOrder {
 
+    final int DENY_CRAWLER = -110
     final int DENY_PATHS = -100
     final int RATE_LIMITER = -50
     final int PULL_METRICS = 10
diff --git a/src/main/groovy/io/seqera/wave/service/data/stream/MessageStream.groovy b/src/main/groovy/io/seqera/wave/service/data/stream/MessageStream.groovy
index cb77741b4..5f0ae3d91 100644
--- a/src/main/groovy/io/seqera/wave/service/data/stream/MessageStream.groovy
+++ b/src/main/groovy/io/seqera/wave/service/data/stream/MessageStream.groovy
@@ -28,7 +28,7 @@ interface MessageStream {
     /**
      * Initialize the stream with the given Id
      *
-     * @param streamId The uniqur ID of the stream to be initialized
+     * @param streamId The unique ID of the stream to be initialized
      */
     void init(String streamId)
 
diff --git a/src/test/groovy/io/seqera/wave/controller/InspectControllerTest.groovy b/src/test/groovy/io/seqera/wave/controller/InspectControllerTest.groovy
index 98788cf6c..de1324258 100644
--- a/src/test/groovy/io/seqera/wave/controller/InspectControllerTest.groovy
+++ b/src/test/groovy/io/seqera/wave/controller/InspectControllerTest.groovy
@@ -29,7 +29,6 @@ import io.micronaut.test.annotation.MockBean
 import io.micronaut.test.extensions.spock.annotation.MicronautTest
 import io.seqera.wave.api.ContainerInspectRequest
 import io.seqera.wave.api.ContainerInspectResponse
-import io.seqera.wave.exception.BadRequestException
 import io.seqera.wave.service.logs.BuildLogService
 import io.seqera.wave.service.logs.BuildLogServiceImpl
 import jakarta.inject.Inject
diff --git a/src/test/groovy/io/seqera/wave/controller/ScanControllerTest.groovy b/src/test/groovy/io/seqera/wave/controller/ScanControllerTest.groovy
index 5677bf77a..013132a62 100644
--- a/src/test/groovy/io/seqera/wave/controller/ScanControllerTest.groovy
+++ b/src/test/groovy/io/seqera/wave/controller/ScanControllerTest.groovy
@@ -92,11 +92,10 @@ class ScanControllerTest extends Specification {
         res.body().requestId == scan.requestId
     }
 
-
     def "should return 404 and null"() {
         when:
         def req = HttpRequest.GET("/v1alpha1/scans/unknown")
-        def res = client.toBlocking().exchange(req, WaveScanRecord)
+        client.toBlocking().exchange(req, WaveScanRecord)
 
         then:
         def e = thrown(HttpClientResponseException)
diff --git a/src/test/groovy/io/seqera/wave/controller/ServiceInfoControllerTest.groovy b/src/test/groovy/io/seqera/wave/controller/ServiceInfoControllerTest.groovy
index c79ef7d91..dd3c7795a 100644
--- a/src/test/groovy/io/seqera/wave/controller/ServiceInfoControllerTest.groovy
+++ b/src/test/groovy/io/seqera/wave/controller/ServiceInfoControllerTest.groovy
@@ -20,25 +20,65 @@ package io.seqera.wave.controller
 
 import spock.lang.Specification
 
-import io.micronaut.http.HttpResponse
+import io.micronaut.http.HttpRequest
 import io.micronaut.http.HttpStatus
-
+import io.micronaut.http.client.DefaultHttpClientConfiguration
+import io.micronaut.http.client.HttpClient
+import io.micronaut.http.client.annotation.Client
+import io.micronaut.http.client.exceptions.HttpClientResponseException
+import io.micronaut.runtime.server.EmbeddedServer
+import io.micronaut.test.extensions.spock.annotation.MicronautTest
+import jakarta.inject.Inject
 /**
  *
  * @author Munish Chouhan
  */
+@MicronautTest
 class ServiceInfoControllerTest extends Specification {
 
+    @Inject
+    @Client("/")
+    HttpClient client
+
+    @Inject
+    EmbeddedServer embeddedServer
+
+    def 'should get service info' () {
+        when:
+        def request = HttpRequest.GET("/service-info")
+        def resp = client.toBlocking().exchange(request, String)
+        then:
+        resp.status.code == 200
+    }
+
+    def 'should deny service info' () {
+        when:
+        def request = HttpRequest.GET("/service-info").header('User-Agent','Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')
+        client.toBlocking().exchange(request, String)
+        then:
+        def e = thrown(HttpClientResponseException)
+        e.status == HttpStatus.METHOD_NOT_ALLOWED
+    }
+
     def 'should redirect to /openapi/'() {
         given:
-        def controller = new ServiceInfoController()
-
+        def uri = embeddedServer.getContextURI()
+        and:
+        // Create a dedicated HttpClient with redirects disabled
+        def config = new DefaultHttpClientConfiguration()
+        config.setFollowRedirects(false)
+        def noRedirectClient = HttpClient.create(uri.toURL(), config)
         when:
-        HttpResponse response = controller.getOpenAPI()
+        def request = HttpRequest.GET("/openapi")
+        def resp = noRedirectClient.toBlocking().exchange(request, String)
 
         then:
-        response.status == HttpStatus.MOVED_PERMANENTLY
-        response.header('Location') == '/openapi/'
+        resp.status == HttpStatus.MOVED_PERMANENTLY // Expect 301
+        resp.headers.get("Location") == "/openapi/" // Validate redirect location
+
+        cleanup:
+        // Release the client created by this test, it is not managed by the application context
+        noRedirectClient?.close()
     }
 
 }