
Commit d7869de

Merge pull request #262 from kris-sigur/sitemaps
Support for extracting URLs in sitemaps

2 parents: c1bcdd9 + 396467c

File tree: 7 files changed (+314 / -1 lines)

commons/pom.xml

Lines changed: 1 addition & 1 deletion

@@ -57,7 +57,7 @@
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
-      <version>1.4</version>
+      <version>2.4</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
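(The commons-io bump from 1.4 to 2.4 is presumably to match the new crawler-commons dependency introduced in modules/pom.xml below; the commit itself does not state the reason.)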

engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml

Lines changed: 9 additions & 0 deletions

@@ -288,6 +288,11 @@ http://example.example/example
 </bean>
 <bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
 </bean>
+<bean id="extractorRobotsTxt" class="org.archive.modules.extractor.ExtractorRobotsTxt">
+</bean>
+<bean id="extractorSitemap" class="org.archive.modules.extractor.ExtractorSitemap">
+</bean>
+
 <bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
  <!-- <property name="extractJavascript" value="true" /> -->
  <!-- <property name="extractValueAttributes" value="true" /> -->
@@ -320,6 +325,10 @@ http://example.example/example
 <ref bean="fetchHttp"/>
 <!-- ...extract outlinks from HTTP headers... -->
 <ref bean="extractorHttp"/>
+<!-- ...extract sitemap urls from robots.txt... -->
+<ref bean="extractorRobotsTxt"/>
+<!-- ...extract links from sitemaps... -->
+<ref bean="extractorSitemap"/>
 <!-- ...extract outlinks from HTML content... -->
 <ref bean="extractorHtml"/>
 <!-- ...extract outlinks from CSS content... -->
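Note the placement in the second hunk: the two new extractors are registered in the fetch chain directly after extractorHttp, so robots.txt responses are scanned for Sitemap: directives, and fetched sitemaps are expanded into their listed URLs, before the generic HTML and CSS extractors run.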

modules/pom.xml

Lines changed: 5 additions & 0 deletions

@@ -62,6 +62,11 @@
       <version>1.6.6</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.github.crawler-commons</groupId>
+      <artifactId>crawler-commons</artifactId>
+      <version>1.0</version>
+    </dependency>
     <dependency>
       <groupId>com.jcraft</groupId>
       <artifactId>jsch</artifactId>
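crawler-commons (https://github.com/crawler-commons/crawler-commons) is the shared crawler library whose SiteMapParser does the actual sitemap and sitemap-index parsing for the new ExtractorSitemap below.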
modules/src/main/java/org/archive/modules/extractor/ExtractorRobotsTxt.java (new file)

Lines changed: 114 additions & 0 deletions

@@ -0,0 +1,114 @@
package org.archive.modules.extractor;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;

public class ExtractorRobotsTxt extends ContentExtractor {
    private static final Logger LOGGER = Logger
            .getLogger(ExtractorRobotsTxt.class.getName());
    private static final Pattern ROBOTS_PATTERN = Pattern
            .compile("^https?://[^/]+/robots.txt$");
    private static final Pattern SITEMAP_PATTERN = Pattern
            .compile("(?i)Sitemap:\\s*(.+)$");

    public static final String ANNOTATION_IS_SITEMAP = "isSitemap";

    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        boolean shouldExtract = false;
        if (uri.isPrerequisite()) {
            shouldExtract = ROBOTS_PATTERN.matcher(uri.getURI()).matches();
            LOGGER.finest("Checked prerequisite " + uri + " GOT " + shouldExtract);
        }
        return shouldExtract;
    }

    public List<String> parseRobotsTxt(InputStream input) {
        ArrayList<String> links = new ArrayList<>();
        BufferedReader reader = new BufferedReader(new InputStreamReader(input));
        try {
            String line;
            Matcher matcher;
            while ((line = reader.readLine()) != null) {
                matcher = SITEMAP_PATTERN.matcher(line);
                if (matcher.matches()) {
                    links.add(matcher.group(1));
                }
            }
        } catch (IOException e) {
            LOGGER.warning(e.toString());
        }
        return links;
    }

    @Override
    protected boolean innerExtract(CrawlURI curi) {
        try {

            // Clone the CrawlURI and change hop path and avoid queueing
            // sitemaps as prerequisites (i.e. strip P from hop path).
            CrawlURI curiClone = new CrawlURI(curi.getUURI(),
                    curi.getPathFromSeed().replace("P", ""), curi.getVia(),
                    curi.getViaContext());

            // Also copy the source over:
            if (curi.getSourceTag() != null) {
                curiClone.setSourceTag(curi.getSourceTag());
            }

            // Parse the robots for the sitemaps.
            List<String> links = parseRobotsTxt(
                    curi.getRecorder()
                            .getContentReplayInputStream());
            LOGGER.finest("Checked " + curi + " GOT " + links);

            // Get the max outlinks (needed by add method):
            int max = getExtractorParameters().getMaxOutlinks();

            // Accrue links:
            for (String link : links) {
                try {
                    // We've found a sitemap:
                    LOGGER.fine("Found site map: " + link);
                    numberOfLinksExtracted.incrementAndGet();

                    // Add links but using the cloned CrawlURI as the crawl
                    // context.
                    CrawlURI newCuri = addRelativeToBase(curiClone, max, link,
                            LinkContext.MANIFEST_MISC, Hop.MANIFEST);

                    // Annotate as a Site Map:
                    newCuri.getAnnotations().add(
                            ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP);

                } catch (URIException e) {
                    logUriError(e, curi.getUURI(), link);
                }
            }

            // Patch outlinks back into original curi:
            for (CrawlURI outlink : curiClone.getOutLinks()) {
                curi.getOutLinks().add(outlink);
            }

            // Return true if any links were discovered:
            return !links.isEmpty();

        } catch (IOException e) {
            LOGGER.log(Level.WARNING, curi.getURI(), e);
            curi.getNonFatalFailures().add(e);
        }
        return false;
    }

}
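For reference, the Sitemap: directive scan above is just a line-by-line regex match over the fetched robots.txt. Below is a minimal standalone sketch of the same logic; the RobotsSitemapScan class and the sample robots.txt content are illustrative, not part of the commit.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RobotsSitemapScan {

    // Same directive pattern as ExtractorRobotsTxt: case-insensitive
    // "Sitemap:" followed by the sitemap URL.
    private static final Pattern SITEMAP_PATTERN = Pattern
            .compile("(?i)Sitemap:\\s*(.+)$");

    public static List<String> scan(String robotsTxt) throws IOException {
        List<String> links = new ArrayList<>();
        BufferedReader reader = new BufferedReader(new StringReader(robotsTxt));
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher matcher = SITEMAP_PATTERN.matcher(line);
            // matches() anchors at the start of the line, as in the extractor.
            if (matcher.matches()) {
                links.add(matcher.group(1));
            }
        }
        return links;
    }

    public static void main(String[] args) throws IOException {
        String robots = "User-agent: *\n"
                + "Disallow: /private/\n"
                + "Sitemap: http://example.com/sitemap.xml\n";
        // Prints: [http://example.com/sitemap.xml]
        System.out.println(scan(robots));
    }
}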
modules/src/main/java/org/archive/modules/extractor/ExtractorSitemap.java (new file)

Lines changed: 178 additions & 0 deletions

@@ -0,0 +1,178 @@
package org.archive.modules.extractor;

import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;

/**
 *
 * @author Andrew Jackson <[email protected]>
 *
 */
public class ExtractorSitemap extends ContentExtractor {
    private static final Logger LOGGER = Logger
            .getLogger(ExtractorSitemap.class.getName());

    /* (non-Javadoc)
     * @see org.archive.modules.extractor.ContentExtractor#shouldExtract(org.archive.modules.CrawlURI)
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        // If declared as such:
        if (uri.getAnnotations()
                .contains(ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP)) {
            if (uri.is2XXSuccess()) {
                LOGGER.fine("This url (" + uri
                        + ") is declared to be a sitemap (via robots.txt) and is a HTTP 200.");
                return true;
            } else {
                LOGGER.fine("This url (" + uri
                        + ") is declared to be a sitemap (via robots.txt) but is a HTTP "
                        + uri.getFetchStatus() + ".");
            }
        }

        // Via content type:
        String mimeType = uri.getContentType();
        if (mimeType != null) {
            // Looks like XML:
            if (mimeType.toLowerCase().startsWith("text/xml")
                    || mimeType.toLowerCase().startsWith("application/xml")) {

                // check if content starts with xml preamble "<?xml" and does
                // contain "<urlset " or "<sitemapindex" early in the content
                String contentStartingChunk = uri.getRecorder()
                        .getContentReplayPrefixString(400);
                if (contentStartingChunk.matches("(?is)[\\ufeff]?<\\?xml\\s.*")
                        && contentStartingChunk.matches(
                                "(?is).*(?:<urlset|<sitemapindex[>\\s]).*")) {
                    LOGGER.info("Based on content sniffing, this is a sitemap: "
                            + uri);
                    return true;
                }
            }
        }

        // Otherwise, not
        return false;
    }

    /* (non-Javadoc)
     * @see org.archive.modules.extractor.ContentExtractor#innerExtract(org.archive.modules.CrawlURI)
     */
    @Override
    protected boolean innerExtract(CrawlURI uri) {
        // Parse the sitemap:
        AbstractSiteMap sitemap = parseSiteMap(uri);

        // Did that work?
        if (sitemap != null) {
            // Process results:
            if (sitemap.isIndex()) {
                final Collection<AbstractSiteMap> links = ((SiteMapIndex) sitemap)
                        .getSitemaps();
                for (final AbstractSiteMap asm : links) {
                    if (asm == null) {
                        continue;
                    }
                    this.recordOutlink(uri, asm.getUrl(), asm.getLastModified(),
                            true);
                }
            } else {
                final Collection<SiteMapURL> links = ((SiteMap) sitemap)
                        .getSiteMapUrls();
                for (final SiteMapURL url : links) {
                    if (url == null) {
                        continue;
                    }
                    this.recordOutlink(uri, url.getUrl(), url.getLastModified(),
                            false);
                }
            }
        }

        return false;
    }

    /**
     * Parse the sitemap using the Crawler Commons content-sniffing parser.
     *
     * @param uri
     * @return
     */
    private AbstractSiteMap parseSiteMap(CrawlURI uri) {
        // The thing we will create:
        AbstractSiteMap sitemap = null;

        // Be strict about URLs but allow partial extraction:
        SiteMapParser smp = new SiteMapParser(true, true);
        // Parse it up:
        try {
            // Sitemaps are not supposed to be bigger than 50MB (according to
            // Google) so if we hit problems we can implement that limit:
            byte[] content = IOUtils.toByteArray(
                    uri.getRecorder().getContentReplayInputStream());
            if (content.length > 52428800) {
                LOGGER.warning("Found sitemap exceeding 50MB " + uri + " "
                        + content.length);
            }
            // Now we can process it:
            sitemap = smp.parseSiteMap(content, new URL(uri.getURI()));
        } catch (IOException e) {
            LOGGER.log(Level.WARNING,
                    "I/O Exception when parsing sitemap " + uri, e);
        } catch (UnknownFormatException e) {
            LOGGER.log(Level.WARNING,
                    "UnknownFormatException when parsing sitemap " + uri, e);
        }
        return sitemap;
    }

    private void recordOutlink(CrawlURI curi, URL newUri, Date lastModified,
            boolean isSitemap) {
        try {
            // Get the max outlinks (needed by add method):
            //
            // Because sitemaps are really important we excuse this extractor
            // from the general setting:
            //
            // getExtractorParameters().getMaxOutlinks();
            //
            // And instead use the maximum that is allowed for a sitemap:
            int max = 50000;

            // Add the URI:
            // Adding 'regular' URL listed in the sitemap
            addRelativeToBase(curi, max, newUri.toString(),
                    LinkContext.MANIFEST_MISC, Hop.MANIFEST);

            // And log about it:
            LOGGER.fine("Found " + newUri + " from " + curi + " Dated "
                    + lastModified + " and with isSitemap = " + isSitemap);
            // Count it:
            numberOfLinksExtracted.incrementAndGet();
        } catch (URIException e) {
            LOGGER.log(Level.WARNING,
                    "URIException when recording outlink " + newUri, e);
        }

    }

}
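For reference, here is a minimal standalone sketch of the crawler-commons parsing step used above, fed an in-memory urlset document rather than Heritrix's content replay stream; the SitemapParseSketch class and the sample XML are illustrative, not part of the commit.

import java.net.URL;
import java.nio.charset.StandardCharsets;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;

public class SitemapParseSketch {
    public static void main(String[] args) throws Exception {
        String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
                + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
                + "<url><loc>http://example.com/page1</loc></url>"
                + "<url><loc>http://example.com/page2</loc></url>"
                + "</urlset>";
        byte[] content = xml.getBytes(StandardCharsets.UTF_8);

        // Strict URL checking but allow partial extraction, matching the
        // SiteMapParser settings used by ExtractorSitemap above.
        SiteMapParser smp = new SiteMapParser(true, true);
        AbstractSiteMap sitemap = smp.parseSiteMap(content,
                new URL("http://example.com/sitemap.xml"));

        // A <sitemapindex> document would come back with isIndex() == true and
        // be walked via SiteMapIndex.getSitemaps(); a plain <urlset> yields
        // SiteMapURL entries, as in innerExtract() above.
        if (!sitemap.isIndex()) {
            for (SiteMapURL u : ((SiteMap) sitemap).getSiteMapUrls()) {
                System.out.println(u.getUrl() + " lastmod=" + u.getLastModified());
            }
        }
    }
}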

modules/src/main/java/org/archive/modules/extractor/Hop.java

Lines changed: 3 additions & 0 deletions

@@ -55,6 +55,9 @@ public enum Hop {
      * material, but deduced by convention.
      */
     INFERRED('I'),
+
+    /** Found in some form of site provided URL manifest (e.g. site map) */
+    MANIFEST('M'),
 
     /** Synthesized form-submit */
     SUBMIT('S');
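The single-character code is what appears in a URI's discovery path, so manifest-discovered URIs carry an 'M' hop; ExtractorRobotsTxt above works with the same mechanism when it strips 'P' from the cloned URI's path-from-seed so that sitemaps are not treated as prerequisites.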

modules/src/main/java/org/archive/modules/extractor/LinkContext.java

Lines changed: 4 additions & 0 deletions

@@ -76,6 +76,10 @@ public String toString() {
     final public static LinkContext PREREQ_MISC
         = new SimpleLinkContext("=PREREQ_MISC");
 
+    /** Stand-in value for manifest urls without other context. */
+    final public static LinkContext MANIFEST_MISC
+        = new SimpleLinkContext("=MANIFEST_MISC");
+
     public boolean equals(Object o) {
         if (o == this) {
             return true;