diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java index 5beec2782..498b73780 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java @@ -16,10 +16,13 @@ */ package org.apache.stormcrawler.filtering.sitemap; +import com.fasterxml.jackson.databind.JsonNode; import java.net.URL; +import java.util.Map; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.bolt.SiteMapParserBolt; import org.apache.stormcrawler.filtering.URLFilter; +import org.apache.stormcrawler.util.ConfUtils; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -35,7 +38,7 @@ * } * * - * Will be replaced by Will be replaced by MetadataFilter to filter based on * multiple key values * @@ -43,14 +46,42 @@ */ public class SitemapFilter extends URLFilter { + private static final String SITEMAP_DISCOVERY_PARAM_KEY = "sitemap.discovery"; + + private boolean sitemapsAutoDiscovery = false; + + @Override + public void configure( + @NotNull Map stormConf, + @NotNull JsonNode filtersConf, + @NotNull String name) { + super.configure(stormConf, filtersConf); + sitemapsAutoDiscovery = ConfUtils.getBoolean(stormConf, SITEMAP_DISCOVERY_PARAM_KEY, false); + } + @Override public @Nullable String filter( @Nullable URL sourceUrl, @Nullable Metadata sourceMetadata, @NotNull String urlToFilter) { + if (sourceMetadata == null) { + return urlToFilter; + } + boolean smautodisco = false; + // check in the metadata if discovery setting has been + // overridden + String localSitemapDiscoveryVal = sourceMetadata.getFirstValue(SITEMAP_DISCOVERY_PARAM_KEY); - if (sourceMetadata != null - && !Boolean.parseBoolean( + if ("true".equalsIgnoreCase(localSitemapDiscoveryVal)) { + smautodisco = true; + } else if ("false".equalsIgnoreCase(localSitemapDiscoveryVal)) { + smautodisco = false; + } else { + smautodisco = sitemapsAutoDiscovery; + } + if (!smautodisco) { + return urlToFilter; + } else if (!Boolean.parseBoolean( sourceMetadata.getFirstValue(SiteMapParserBolt.isSitemapKey)) && Boolean.parseBoolean( sourceMetadata.getFirstValue(SiteMapParserBolt.foundSitemapKey))) {