Skip to content

Commit 308cee8

Browse files
author
kristinn
committed
Add sitemap extraction to default profile
1 parent ba8f669 commit 308cee8

File tree

1 file changed

+9
-0
lines changed

1 file changed

+9
-0
lines changed

engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -288,6 +288,11 @@ http://example.example/example
288288
</bean>
289289
<bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP">
290290
</bean>
291+
<bean id="extractorRobotsTxt" class="org.archive.modules.extractor.ExtractorRobotsTxt">
292+
</bean>
293+
<bean id="extractorSitemap" class="org.archive.modules.extractor.ExtractorSitemap">
294+
</bean>
295+
291296
<bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML">
292297
<!-- <property name="extractJavascript" value="true" /> -->
293298
<!-- <property name="extractValueAttributes" value="true" /> -->
@@ -320,6 +325,10 @@ http://example.example/example
320325
<ref bean="fetchHttp"/>
321326
<!-- ...extract outlinks from HTTP headers... -->
322327
<ref bean="extractorHttp"/>
328+
<!-- ...extract sitemap urls from robots.txt... -->
329+
<ref bean="extractorRobotsTxt"/>
330+
<!-- ...extract links from sitemaps... -->
331+
<ref bean="extractorSitemap"/>
323332
<!-- ...extract outlinks from HTML content... -->
324333
<ref bean="extractorHtml"/>
325334
<!-- ...extract outlinks from CSS content... -->

0 commit comments

Comments
 (0)