
Commit d7869de

Merge pull request #262 from kris-sigur/sitemaps
Support for extracting URLs in sitemaps

2 parents: c1bcdd9 + 396467c

File tree: 7 files changed (+314 / -1 lines)

commons/pom.xml

Lines changed: 1 addition & 1 deletion

@@ -57,7 +57,7 @@
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
-      <version>1.4</version>
+      <version>2.4</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
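(The commons-io bump from 1.4 to 2.4 is presumably to match the new crawler-commons dependency introduced in modules/pom.xml below; the commit itself does not state the reason.)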

engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml

Lines changed: 9 additions & 0 deletions

@@ -288,6 +288,11 @@ http://example.example/example
 </bean>
 <bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
 </bean>
+<bean id="extractorRobotsTxt" class="org.archive.modules.extractor.ExtractorRobotsTxt">
+</bean>
+<bean id="extractorSitemap" class="org.archive.modules.extractor.ExtractorSitemap">
+</bean>
+
 <bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
  <!-- <property name="extractJavascript" value="true" /> -->
  <!-- <property name="extractValueAttributes" value="true" /> -->
@@ -320,6 +325,10 @@ http://example.example/example
 <ref bean="fetchHttp"/>
 <!-- ...extract outlinks from HTTP headers... -->
 <ref bean="extractorHttp"/>
+<!-- ...extract sitemap urls from robots.txt... -->
+<ref bean="extractorRobotsTxt"/>
+<!-- ...extract links from sitemaps... -->
+<ref bean="extractorSitemap"/>
 <!-- ...extract outlinks from HTML content... -->
 <ref bean="extractorHtml"/>
 <!-- ...extract outlinks from CSS content... -->
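Note the placement in the second hunk: the two new extractors are registered in the fetch chain directly after extractorHttp, so robots.txt responses are scanned for Sitemap: directives, and fetched sitemaps are expanded into their listed URLs, before the generic HTML and CSS extractors run.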

modules/pom.xml

Lines changed: 5 additions & 0 deletions

@@ -62,6 +62,11 @@
       <version>1.6.6</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.github.crawler-commons</groupId>
+      <artifactId>crawler-commons</artifactId>
+      <version>1.0</version>
+    </dependency>
     <dependency>
       <groupId>com.jcraft</groupId>
       <artifactId>jsch</artifactId>
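crawler-commons (https://github.com/crawler-commons/crawler-commons) is the shared crawler library whose SiteMapParser does the actual sitemap and sitemap-index parsing for the new ExtractorSitemap below.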
modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java (new file)

Lines changed: 114 additions & 0 deletions

@@ -0,0 +1,114 @@
package org.archive.modules.extractor;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;

public class ExtractorRobotsTxt extends ContentExtractor {
    private static final Logger LOGGER = Logger
            .getLogger(ExtractorRobotsTxt.class.getName());
    private static final Pattern ROBOTS_PATTERN = Pattern
            .compile("^https?://[^/]+/robots.txt$");
    private static final Pattern SITEMAP_PATTERN = Pattern
            .compile("(?i)Sitemap:\\s*(.+)$");

    public static final String ANNOTATION_IS_SITEMAP = "isSitemap";

    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        boolean shouldExtract = false;
        if (uri.isPrerequisite()) {
            shouldExtract = ROBOTS_PATTERN.matcher(uri.getURI()).matches();
            LOGGER.finest("Checked prerequisite " + uri + " GOT " + shouldExtract);
        }
        return shouldExtract;
    }

    public List<String> parseRobotsTxt(InputStream input) {
        ArrayList<String> links = new ArrayList<>();
        BufferedReader reader = new BufferedReader(new InputStreamReader(input));
        try {
            String line;
            Matcher matcher;
            while ((line = reader.readLine()) != null) {
                matcher = SITEMAP_PATTERN.matcher(line);
                if (matcher.matches()) {
                    links.add(matcher.group(1));
                }
            }
        } catch (IOException e) {
            LOGGER.warning(e.toString());
        }
        return links;
    }

    @Override
    protected boolean innerExtract(CrawlURI curi) {
        try {

            // Clone the CrawlURI and change hop path and avoid queueing
            // sitemaps as prerequisites (i.e. strip P from hop path).
            CrawlURI curiClone = new CrawlURI(curi.getUURI(),
                    curi.getPathFromSeed().replace("P", ""), curi.getVia(),
                    curi.getViaContext());

            // Also copy the source over:
            if (curi.getSourceTag() != null) {
                curiClone.setSourceTag(curi.getSourceTag());
            }

            // Parse the robots for the sitemaps.
            List<String> links = parseRobotsTxt(
                    curi.getRecorder()
                            .getContentReplayInputStream());
            LOGGER.finest("Checked " + curi + " GOT " + links);

            // Get the max outlinks (needed by add method):
            int max = getExtractorParameters().getMaxOutlinks();

            // Accrue links:
            for (String link : links) {
                try {
                    // We've found a sitemap:
                    LOGGER.fine("Found site map: " + link);
                    numberOfLinksExtracted.incrementAndGet();

                    // Add links but using the cloned CrawlURI as the crawl
                    // context.
                    CrawlURI newCuri = addRelativeToBase(curiClone, max, link,
                            LinkContext.MANIFEST_MISC, Hop.MANIFEST);

                    // Annotate as a Site Map:
                    newCuri.getAnnotations().add(
                            ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP);

                } catch (URIException e) {
                    logUriError(e, curi.getUURI(), link);
                }
            }

            // Patch outlinks back into original curi:
            for (CrawlURI outlink : curiClone.getOutLinks()) {
                curi.getOutLinks().add(outlink);
            }

            // Return true if any links were discovered:
            return !links.isEmpty();

        } catch (IOException e) {
            LOGGER.log(Level.WARNING, curi.getURI(), e);
            curi.getNonFatalFailures().add(e);
        }
        return false;
    }

}
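For reference, the Sitemap: directive scan above is just a line-by-line regex match over the fetched robots.txt. Below is a minimal standalone sketch of the same logic; the RobotsSitemapScan class and the sample robots.txt content are illustrative, not part of the commit.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RobotsSitemapScan {

    // Same directive pattern as ExtractorRobotsTxt: case-insensitive
    // "Sitemap:" followed by the sitemap URL.
    private static final Pattern SITEMAP_PATTERN = Pattern
            .compile("(?i)Sitemap:\\s*(.+)$");

    public static List<String> scan(String robotsTxt) throws IOException {
        List<String> links = new ArrayList<>();
        BufferedReader reader = new BufferedReader(new StringReader(robotsTxt));
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher matcher = SITEMAP_PATTERN.matcher(line);
            // matches() anchors at the start of the line, as in the extractor.
            if (matcher.matches()) {
                links.add(matcher.group(1));
            }
        }
        return links;
    }

    public static void main(String[] args) throws IOException {
        String robots = "User-agent: *\n"
                + "Disallow: /private/\n"
                + "Sitemap: http://example.com/sitemap.xml\n";
        // Prints: [http://example.com/sitemap.xml]
        System.out.println(scan(robots));
    }
}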
modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java (new file)

Lines changed: 178 additions & 0 deletions

@@ -0,0 +1,178 @@
package org.archive.modules.extractor;

import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;

/**
 *
 * @author Andrew Jackson <[email protected]>
 *
 */
public class ExtractorSitemap extends ContentExtractor {
    private static final Logger LOGGER = Logger
            .getLogger(ExtractorSitemap.class.getName());

    /* (non-Javadoc)
     * @see org.archive.modules.extractor.ContentExtractor#shouldExtract(org.archive.modules.CrawlURI)
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        // If declared as such:
        if (uri.getAnnotations()
                .contains(ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP)) {
            if (uri.is2XXSuccess()) {
                LOGGER.fine("This url (" + uri
                        + ") is declared to be a sitemap (via robots.txt) and is a HTTP 200.");
                return true;
            } else {
                LOGGER.fine("This url (" + uri
                        + ") is declared to be a sitemap (via robots.txt) but is a HTTP "
                        + uri.getFetchStatus() + ".");
            }
        }

        // Via content type:
        String mimeType = uri.getContentType();
        if (mimeType != null) {
            // Looks like XML:
            if (mimeType.toLowerCase().startsWith("text/xml")
                    || mimeType.toLowerCase().startsWith("application/xml")) {

                // check if content starts with xml preamble "<?xml" and does
                // contain "<urlset " or "<sitemapindex" early in the content
                String contentStartingChunk = uri.getRecorder()
                        .getContentReplayPrefixString(400);
                if (contentStartingChunk.matches("(?is)[\\ufeff]?<\\?xml\\s.*")
                        && contentStartingChunk.matches(
                                "(?is).*(?:<urlset|<sitemapindex[>\\s]).*")) {
                    LOGGER.info("Based on content sniffing, this is a sitemap: "
                            + uri);
                    return true;
                }
            }
        }

        // Otherwise, not
        return false;
    }

    /* (non-Javadoc)
     * @see org.archive.modules.extractor.ContentExtractor#innerExtract(org.archive.modules.CrawlURI)
     */
    @Override
    protected boolean innerExtract(CrawlURI uri) {
        // Parse the sitemap:
        AbstractSiteMap sitemap = parseSiteMap(uri);

        // Did that work?
        if (sitemap != null) {
            // Process results:
            if (sitemap.isIndex()) {
                final Collection<AbstractSiteMap> links = ((SiteMapIndex) sitemap)
                        .getSitemaps();
                for (final AbstractSiteMap asm : links) {
                    if (asm == null) {
                        continue;
                    }
                    this.recordOutlink(uri, asm.getUrl(), asm.getLastModified(),
                            true);
                }
            } else {
                final Collection<SiteMapURL> links = ((SiteMap) sitemap)
                        .getSiteMapUrls();
                for (final SiteMapURL url : links) {
                    if (url == null) {
                        continue;
                    }
                    this.recordOutlink(uri, url.getUrl(), url.getLastModified(),
                            false);
                }
            }
        }

        return false;
    }

    /**
     * Parse the sitemap using the Crawler Commons content-sniffing parser.
     *
     * @param uri
     * @return
     */
    private AbstractSiteMap parseSiteMap(CrawlURI uri) {
        // The thing we will create:
        AbstractSiteMap sitemap = null;

        // Be strict about URLs but allow partial extraction:
        SiteMapParser smp = new SiteMapParser(true, true);
        // Parse it up:
        try {
            // Sitemaps are not supposed to be bigger than 50MB (according to
            // Google) so if we hit problems we can implement that limit:
            byte[] content = IOUtils.toByteArray(
                    uri.getRecorder().getContentReplayInputStream());
            if (content.length > 52428800) {
                LOGGER.warning("Found sitemap exceeding 50MB " + uri + " "
                        + content.length);
            }
            // Now we can process it:
            sitemap = smp.parseSiteMap(content, new URL(uri.getURI()));
        } catch (IOException e) {
            LOGGER.log(Level.WARNING,
                    "I/O Exception when parsing sitemap " + uri, e);
        } catch (UnknownFormatException e) {
            LOGGER.log(Level.WARNING,
                    "UnknownFormatException when parsing sitemap " + uri, e);
        }
        return sitemap;
    }

    private void recordOutlink(CrawlURI curi, URL newUri, Date lastModified,
            boolean isSitemap) {
        try {
            // Get the max outlinks (needed by add method):
            //
            // Because sitemaps are really important we excuse this extractor
            // from the general setting:
            //
            // getExtractorParameters().getMaxOutlinks();
            //
            // And instead use the maximum that is allowed for a sitemap:
            int max = 50000;

            // Add the URI:
            // Adding 'regular' URL listed in the sitemap
            addRelativeToBase(curi, max, newUri.toString(),
                    LinkContext.MANIFEST_MISC, Hop.MANIFEST);

            // And log about it:
            LOGGER.fine("Found " + newUri + " from " + curi + " Dated "
                    + lastModified + " and with isSitemap = " + isSitemap);
            // Count it:
            numberOfLinksExtracted.incrementAndGet();
        } catch (URIException e) {
            LOGGER.log(Level.WARNING,
                    "URIException when recording outlink " + newUri, e);
        }

    }

}
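For reference, here is a minimal standalone sketch of the crawler-commons parsing step used above, fed an in-memory urlset document rather than Heritrix's content replay stream; the SitemapParseSketch class and the sample XML are illustrative, not part of the commit.

import java.net.URL;
import java.nio.charset.StandardCharsets;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;

public class SitemapParseSketch {
    public static void main(String[] args) throws Exception {
        String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
                + "<url><loc>http://example.com/page1</loc></url>"
                + "<url><loc>http://example.com/page2</loc></url>"
                + "</urlset>";
        byte[] content = xml.getBytes(StandardCharsets.UTF_8);

        // Strict URL checking but allow partial extraction, matching the
        // SiteMapParser settings used by ExtractorSitemap above.
        SiteMapParser smp = new SiteMapParser(true, true);
        AbstractSiteMap sitemap = smp.parseSiteMap(content,
                new URL("http://example.com/sitemap.xml"));

        // A <sitemapindex> document would come back with isIndex() == true and
        // be walked via SiteMapIndex.getSitemaps(); a plain <urlset> yields
        // SiteMapURL entries, as in innerExtract() above.
        if (!sitemap.isIndex()) {
            for (SiteMapURL u : ((SiteMap) sitemap).getSiteMapUrls()) {
                System.out.println(u.getUrl() + " lastmod=" + u.getLastModified());
            }
        }
    }
}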

modules/src/main/java/org/archive/modules/extractor/Hop.java

Lines changed: 3 additions & 0 deletions

@@ -55,6 +55,9 @@ public enum Hop {
      * material, but deduced by convention.
      */
     INFERRED('I'),
+
+    /** Found in some form of site provided URL manifest (e.g. site map) */
+    MANIFEST('M'),
 
     /** Synthesized form-submit */
     SUBMIT('S');
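The single-character code is what appears in a URI's discovery path, so manifest-discovered URIs carry an 'M' hop; ExtractorRobotsTxt above works with the same mechanism when it strips 'P' from the cloned URI's path-from-seed so that sitemaps are not treated as prerequisites.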

modules/src/main/java/org/archive/modules/extractor/LinkContext.java

Lines changed: 4 additions & 0 deletions

@@ -76,6 +76,10 @@ public String toString() {
     final public static LinkContext PREREQ_MISC
         = new SimpleLinkContext("=PREREQ_MISC");
 
+    /** Stand-in value for manifest urls without other context. */
+    final public static LinkContext MANIFEST_MISC
+        = new SimpleLinkContext("=MANIFEST_MISC");
+
     public boolean equals(Object o) {
         if (o == this) {
             return true;