From 1ed207976c539dd282268d98db4307b2f195de23 Mon Sep 17 00:00:00 2001 From: nruest Date: Mon, 22 Apr 2019 09:17:23 -0400 Subject: [PATCH 001/123] Update README and add LICENSE.txt - Add badges to README - Update Markdown - Clean-up formatting - Add LICENSE.txt in root - Partially addresses #233 --- LICENSE | 11 +++++++++ README.md | 71 +++++++++++++++++-------------------------------------- 2 files changed, 33 insertions(+), 49 deletions(-) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..90edcee40 --- /dev/null +++ b/LICENSE @@ -0,0 +1,11 @@ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/README.md b/README.md index 5f3074336..c1a47653f 100644 --- a/README.md +++ b/README.md @@ -1,64 +1,37 @@ -Readme for Heritrix -==================== +# Heritrix +[![Build Status](https://travis-ci.org/internetarchive/heritrix3.svg?branch=master)](https://travis-ci.org/internetarchive/heritrix3) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.archive/heritrix/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.archive/heritrix) +[![Javadoc](https://javadoc-badge.appspot.com/org.archive/heritrix.svg?label=javadoc)](http://builds.archive.org/javadoc/heritrix-3.2.0/) +[![LICENSE](https://img.shields.io/badge/license-Apache-blue.svg?style=flat-square)](./LICENSE) -1. Introduction -2. Crawl Operators! -3. Getting Started -4. Developer Documentation -5. Release History -6. License +## Introduction +Heritrix is the Internet Archive's open-source, extensible, web-scale, archival-quality web crawler project. Heritrix (sometimes spelled heretrix, or misspelled or missaid as heratrix/heritix/heretix/heratix) is an archaic word for heiress (woman who inherits). Since our crawler seeks to collect and preserve the digital artifacts of our culture for the benefit of future researchers and generations, this name seemed apt. -## 1. Introduction +## Crawl Operators! -Heritrix is the Internet Archive's open-source, extensible, web-scale, -archival-quality web crawler project. Heritrix (sometimes spelled heretrix, or -misspelled or missaid as heratrix/heritix/heretix/heratix) is an archaic word -for heiress (woman who inherits). Since our crawler seeks to collect and -preserve the digital artifacts of our culture for the benefit of future -researchers and generations, this name seemed apt. +Heritrix is designed to respect the [`robots.txt`](http://www.robotstxt.org/wc/robots.html) exclusion directives and [META robots tags](http://www.robotstxt.org/wc/exclusion.html#meta). Please consider the +load your crawl will place on seed sites and set politeness policies accordingly. Also, always identify your crawl with contact information in the `User-Agent` so sites that may be adversely affected by your crawl can contact you or adapt their server behavior accordingly. +## Getting Started -## 2. Crawl Operators! 
+- [User Manual](https://github.com/internetarchive/heritrix3/wiki) -Heritrix is designed to respect the robots.txt - exclusion directives and META robots -tags . Please consider the -load your crawl will place on seed sites and set politeness policies -accordingly. Also, always identify your crawl with contact information in the -User-Agent so sites that may be adversely affected by your crawl can contact -you or adapt their server behavior accordingly. +## Developer Documentation +- [Developer Manual](http://crawler.archive.org/articles/developer_manual/index.html) +- [REST API documentation](https://heritrix.readthedocs.io/en/latest/api.html) +- [JavaDoc](http://builds.archive.org/javadoc/heritrix-3.2.0/) (n.b. Javadoc currently out of date) -## 3. Getting Started -See the User Manual, available from +## Latest Releases +Information about releases can be found [here](https://github.com/internetarchive/heritrix3/wiki#latest-releases). -## 4. Developer Documentation +## License -See . -For REST API documentation, see -and for JavaDoc see (n.b. Javadoc currently out of date). +Heritrix is free software; you can redistribute it and/or modify it under the terms of the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) +Some individual source code files are subject to or offered under other licenses. See the included [`LICENSE.txt`](./LICENSE) file for more information. -## 5. Latest Releases - -Information about releases can be found at - - -## 6. License - -Heritrix is free software; you can redistribute it and/or modify it -under the terms of the Apache License, Version 2.0: - - http://www.apache.org/licenses/LICENSE-2.0 - -Some individual source code files are subject to or offered under other -licenses. See the included LICENSE.txt file for more information. - -Heritrix is distributed with the libraries it depends upon. The -libraries can be found under the 'lib' directory, and are used under -the terms of their respective licenses, which are included alongside -the libraries in the 'lib' directory. - +Heritrix is distributed with the libraries it depends upon. The libraries can be found under the `lib` directory in the release distribution, and are used under the terms of their respective licenses, which are included alongside the libraries in the `lib` directory. 
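The "Crawl Operators!" section above asks operators to set politeness policies and to identify the crawl with contact information in the `User-Agent`. In a crawl job this is normally configured on the `metadata` bean in `crawler-beans.cxml` (or via the web UI); the sketch below shows the same settings through the `CrawlMetadata` API, assuming its stock bean properties (`operatorContactUrl`, `userAgentTemplate`, `operatorFrom`) — the URLs and addresses are placeholders.

```java
import org.archive.modules.CrawlMetadata;

// Sketch only: these values normally live on the "metadata" bean in
// crawler-beans.cxml rather than being set from Java. Property names are
// the stock CrawlMetadata ones; verify them against the release you run.
public class OperatorContactSketch {
    public static CrawlMetadata politeMetadata() {
        CrawlMetadata metadata = new CrawlMetadata();
        // Page webmasters can visit to learn who is crawling them and why
        // (placeholder URL).
        metadata.setOperatorContactUrl("http://example.com/crawl-contact");
        // @VERSION@ and @OPERATOR_CONTACT_URL@ are expanded by Heritrix.
        metadata.setUserAgentTemplate(
                "Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)");
        // Optional From: header with a reachable mailbox (placeholder).
        metadata.setOperatorFrom("crawl-operator@example.com");
        return metadata;
    }
}
```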
From 25848e982177d09c5d51336ebb708836ed42ab4c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 7 Jun 2019 14:24:48 -0700 Subject: [PATCH 002/123] cap number of videos per page to 1000 --- .../org/archive/modules/extractor/ExtractorYoutubeDL.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index a1f2df97d..a729efc46 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -79,6 +79,8 @@ public class ExtractorYoutubeDL extends Extractor implements Lifecycle { protected static final String YDL_CONTAINING_PAGE_TIMESTAMP = "ydl-containing-page-timestamp"; protected static final String YDL_CONTAINING_PAGE_URI = "ydl-containing-page-uri"; + protected static final int MAX_VIDEOS_PER_PAGE = 1000; + protected transient Logger ydlLogger = null; protected CrawlerLoggerModule crawlerLoggerModule; @@ -305,7 +307,8 @@ protected List runYoutubeDL(CrawlURI uri) { * https://github.com/ytdl-org/youtube-dl/blob/master/README.md#format-selection */ ProcessBuilder pb = new ProcessBuilder("youtube-dl", "--ignore-config", - "--simulate", "--dump-json", "--format=best", uri.toString()); + "--simulate", "--dump-json", "--format=best", + "--playlist-end=" + MAX_VIDEOS_PER_PAGE, uri.toString()); logger.fine("running " + pb.command()); Process proc = null; From 4d6314bc5066c78e8cc6714c32df07cd0fcc0c5b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 7 Jun 2019 15:14:32 -0700 Subject: [PATCH 003/123] use --dump-single-json seems better/cleaner and we will want the single json form if/when we start writing it to warc --- .../modules/extractor/ExtractorYoutubeDL.java | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index a729efc46..d250fd05f 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -22,8 +22,6 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; -import java.util.ArrayList; -import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -43,6 +41,8 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.Lifecycle; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.JsonParseException; import com.google.gson.JsonStreamParser; @@ -139,15 +139,17 @@ protected void extract(CrawlURI uri) { logCapturedVideo(uri, ydlAnnotation); } } else { - List ydlJsons = runYoutubeDL(uri); - if (ydlJsons != null && !ydlJsons.isEmpty()) { - for (JsonObject json: ydlJsons) { + JsonObject ydlJson = runYoutubeDL(uri); + if (ydlJson != null && ydlJson.has("entries")) { + JsonArray jsonEntries = ydlJson.getAsJsonArray("entries"); + for (JsonElement e: jsonEntries) { + JsonObject json = (JsonObject) e; if (json.get("url") != null) { String videoUrl = json.get("url").getAsString(); addVideoOutlink(uri, json, videoUrl); } } - String annotation = "youtube-dl:" + ydlJsons.size(); + 
String annotation = "youtube-dl:" + jsonEntries.size(); uri.getAnnotations().add(annotation); logContainingPage(uri, annotation); } @@ -292,13 +294,7 @@ public String call() throws IOException { return output; } - /** - * - * @param uri - * @return list of json blobs returned by {@code youtube-dl --dump-json}, or - * empty list if no videos found, or failure - */ - protected List runYoutubeDL(CrawlURI uri) { + protected JsonObject runYoutubeDL(CrawlURI uri) { /* * --format=best * @@ -307,7 +303,7 @@ protected List runYoutubeDL(CrawlURI uri) { * https://github.com/ytdl-org/youtube-dl/blob/master/README.md#format-selection */ ProcessBuilder pb = new ProcessBuilder("youtube-dl", "--ignore-config", - "--simulate", "--dump-json", "--format=best", + "--simulate", "--dump-single-json", "--format=best", "--playlist-end=" + MAX_VIDEOS_PER_PAGE, uri.toString()); logger.fine("running " + pb.command()); @@ -347,11 +343,11 @@ protected List runYoutubeDL(CrawlURI uri) { proc.destroyForcibly(); } - List ydlJsons = new ArrayList(); JsonStreamParser parser = new JsonStreamParser(output.stdout); + JsonObject ydlJson = null; try { - while (parser.hasNext()) { - ydlJsons.add((JsonObject) parser.next()); + if (parser.hasNext()) { + ydlJson = (JsonObject) parser.next(); } } catch (JsonParseException e) { // sometimes we get no output at all from youtube-dl, which @@ -364,7 +360,7 @@ protected List runYoutubeDL(CrawlURI uri) { return null; } - return ydlJsons; + return ydlJson; } @Override From 9435f761a6a6228d17dd89474a560a632ff45715 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 11 Jun 2019 13:45:51 -0700 Subject: [PATCH 004/123] configurable warc writer chain exercised only lightly at this point --- .../restlet/profile-crawler-beans.cxml | 18 +- .../warc/DnsResponseRecordBuilder.java | 49 ++++ .../FtpControlConversationRecordBuilder.java | 51 ++++ .../warc/FtpResponseRecordBuilder.java | 53 ++++ .../warc/HttpRequestRecordBuilder.java | 46 ++++ .../warc/HttpResponseRecordBuilder.java | 82 ++++++ .../modules/warc/MetadataRecordBuilder.java | 111 ++++++++ .../modules/warc/RevisitRecordBuilder.java | 71 +++++ .../modules/warc/WARCRecordBuilder.java | 72 +++++ .../warc/WhoisResponseRecordBuilder.java | 48 ++++ .../writer/BaseWARCWriterProcessor.java | 253 ++++++++++++++++++ .../writer/WARCWriterChainProcessor.java | 126 +++++++++ .../modules/writer/WARCWriterProcessor.java | 231 +--------------- .../modules/writer/WriterPoolProcessor.java | 4 + 14 files changed, 983 insertions(+), 232 deletions(-) create mode 100644 modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java create mode 100644 modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java create mode 100644 modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java create mode 100644 modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java create mode 100644 modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java create mode 100644 modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java create mode 100644 modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java create mode 100644 modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java create mode 100644 modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java create mode 100644 modules/src/main/java/org/archive/modules/writer/BaseWARCWriterProcessor.java create mode 100644 
modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml index 97b35c27a..96106c803 100644 --- a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml +++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml @@ -334,7 +334,7 @@ http://example.example/example - + @@ -349,11 +349,19 @@ http://example.example/example --> - - - - + + + + + + + + + + + + diff --git a/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java new file mode 100644 index 000000000..2751b3a04 --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java @@ -0,0 +1,49 @@ +package org.archive.modules.warc; + +import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP; +import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL; + +import java.io.IOException; +import java.net.URI; + +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.ReplayInputStream; +import org.archive.io.warc.WARCRecordInfo; +import org.archive.modules.CrawlURI; +import org.archive.util.ArchiveUtils; + +public class DnsResponseRecordBuilder extends WARCRecordBuilder { + + @Override + public boolean shouldProcess(CrawlURI curi) { + return "dns".equals(curi.getUURI().getScheme().toLowerCase()); + } + + @Override + public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException { + final String timestamp = + ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); + + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.response); + recordInfo.setUrl(curi.toString()); + recordInfo.setCreate14DigitDate(timestamp); + recordInfo.setMimetype(curi.getContentType()); + recordInfo.setRecordId(generateRecordID()); + + recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); + recordInfo.setEnforceLength(true); + + String ip = (String)curi.getData().get(A_DNS_SERVER_IP_LABEL); + if (ip != null && ip.length() > 0) { + recordInfo.addExtraHeader(HEADER_KEY_IP, ip); + } + + ReplayInputStream ris = + curi.getRecorder().getRecordedInput().getReplayInputStream(); + recordInfo.setContentStream(ris); + + return recordInfo; + } + +} diff --git a/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java new file mode 100644 index 000000000..3d85c2eb0 --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java @@ -0,0 +1,51 @@ +package org.archive.modules.warc; + +import static org.archive.format.warc.WARCConstants.FTP_CONTROL_CONVERSATION_MIMETYPE; +import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP; +import static org.archive.modules.CoreAttributeConstants.A_FTP_CONTROL_CONVERSATION; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.net.URI; + +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.warc.WARCRecordInfo; +import org.archive.modules.CrawlURI; +import org.archive.util.ArchiveUtils; +import org.archive.util.anvl.ANVLRecord; + +public class FtpControlConversationRecordBuilder extends WARCRecordBuilder { + + 
@Override + public boolean shouldProcess(CrawlURI curi) { + return "ftp".equals(curi.getUURI().getScheme().toLowerCase()); + } + + @Override + public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException { + final String timestamp = + ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); + String controlConversation = + curi.getData().get(A_FTP_CONTROL_CONVERSATION).toString(); + ANVLRecord headers = new ANVLRecord(); + headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi)); + + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setCreate14DigitDate(timestamp); + recordInfo.setUrl(curi.toString()); + recordInfo.setMimetype(FTP_CONTROL_CONVERSATION_MIMETYPE); + recordInfo.setExtraHeaders(headers); + recordInfo.setEnforceLength(true); + recordInfo.setType(WARCRecordType.metadata); + + recordInfo.setRecordId(generateRecordID()); + + byte[] b = controlConversation.getBytes("UTF-8"); + + recordInfo.setContentStream(new ByteArrayInputStream(b)); + recordInfo.setContentLength((long) b.length); + + return recordInfo; + } + +} diff --git a/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java new file mode 100644 index 000000000..60366ec06 --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java @@ -0,0 +1,53 @@ +package org.archive.modules.warc; + +import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; +import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP; +import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL; + +import java.io.IOException; +import java.net.URI; + +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.ReplayInputStream; +import org.archive.io.warc.WARCRecordInfo; +import org.archive.modules.CrawlURI; +import org.archive.util.ArchiveUtils; + +public class FtpResponseRecordBuilder extends WARCRecordBuilder { + + @Override + public boolean shouldProcess(CrawlURI curi) { + return !curi.isRevisit() + && "ftp".equals(curi.getUURI().getScheme().toLowerCase()); + } + + @Override + public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException { + final String timestamp = + ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); + + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setRecordId(generateRecordID()); + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + '<' + concurrentTo.toString() + '>'); + recordInfo.setType(WARCRecordType.response); + recordInfo.setUrl(curi.toString()); + recordInfo.setCreate14DigitDate(timestamp); + recordInfo.setMimetype(curi.getContentType()); + + recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); + recordInfo.setEnforceLength(true); + + String ip = (String)curi.getData().get(A_DNS_SERVER_IP_LABEL); + if (ip != null && ip.length() > 0) { + recordInfo.addExtraHeader(HEADER_KEY_IP, ip); + } + + ReplayInputStream ris = + curi.getRecorder().getRecordedInput().getReplayInputStream(); + recordInfo.setContentStream(ris); + + return recordInfo; + } + +} diff --git a/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java new file mode 100644 index 000000000..b71567aab --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java @@ -0,0 +1,46 @@ +package org.archive.modules.warc; + +import static 
org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; +import static org.archive.format.warc.WARCConstants.HTTP_REQUEST_MIMETYPE; + +import java.io.IOException; +import java.net.URI; + +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.ReplayInputStream; +import org.archive.io.warc.WARCRecordInfo; +import org.archive.modules.CrawlURI; +import org.archive.util.ArchiveUtils; + +public class HttpRequestRecordBuilder extends WARCRecordBuilder { + + @Override + public boolean shouldProcess(CrawlURI curi) { + return curi.getUURI().getScheme().toLowerCase().startsWith("http"); + } + + @Override + public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) + throws IOException { + final String timestamp = + ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); + + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setRecordId(generateRecordID()); + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + "<" + concurrentTo + ">"); + recordInfo.setType(WARCRecordType.request); + recordInfo.setUrl(curi.toString()); + recordInfo.setCreate14DigitDate(timestamp); + recordInfo.setMimetype(HTTP_REQUEST_MIMETYPE); + recordInfo.setContentLength(curi.getRecorder().getRecordedOutput().getSize()); + recordInfo.setEnforceLength(true); + + ReplayInputStream + ris = curi.getRecorder().getRecordedOutput().getReplayInputStream(); + recordInfo.setContentStream(ris); + + return recordInfo; + } + +} diff --git a/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java new file mode 100644 index 000000000..0f83f5e9a --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java @@ -0,0 +1,82 @@ +package org.archive.modules.warc; + +import static org.archive.format.warc.WARCConstants.HEADER_KEY_PAYLOAD_DIGEST; +import static org.archive.format.warc.WARCConstants.HEADER_KEY_TRUNCATED; +import static org.archive.format.warc.WARCConstants.HTTP_RESPONSE_MIMETYPE; +import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_HEAD; +import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_LENGTH; +import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_TIME; +import static org.archive.modules.CoreAttributeConstants.A_WARC_RESPONSE_HEADERS; +import static org.archive.modules.CoreAttributeConstants.HEADER_TRUNC; +import static org.archive.modules.CoreAttributeConstants.LENGTH_TRUNC; +import static org.archive.modules.CoreAttributeConstants.TIMER_TRUNC; + +import java.io.IOException; +import java.net.URI; +import java.util.Collection; + +import org.apache.commons.lang.StringUtils; +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.ReplayInputStream; +import org.archive.io.warc.WARCRecordInfo; +import org.archive.modules.CrawlURI; +import org.archive.util.ArchiveUtils; + +public class HttpResponseRecordBuilder extends WARCRecordBuilder { + + @Override + public boolean shouldProcess(CrawlURI curi) { + return !curi.isRevisit() + && "http".equals(curi.getUURI().getScheme().toLowerCase()); + } + + @Override + public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException { + final String timestamp = + ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); + + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.response); + recordInfo.setRecordId(generateRecordID()); + 
recordInfo.setUrl(curi.toString()); + recordInfo.setCreate14DigitDate(timestamp); + recordInfo.setMimetype(HTTP_RESPONSE_MIMETYPE); + recordInfo.setContentLength( + curi.getRecorder().getRecordedInput().getSize()); + recordInfo.setEnforceLength(true); + + if (curi.getContentDigest() != null) { + recordInfo.addExtraHeader(HEADER_KEY_PAYLOAD_DIGEST, + curi.getContentDigestSchemeString()); + } + + // Check for truncated annotation + String value = null; + Collection anno = curi.getAnnotations(); + if (anno.contains(TIMER_TRUNC)) { + value = NAMED_FIELD_TRUNCATED_VALUE_TIME; + } else if (anno.contains(LENGTH_TRUNC)) { + value = NAMED_FIELD_TRUNCATED_VALUE_LENGTH; + } else if (anno.contains(HEADER_TRUNC)) { + value = NAMED_FIELD_TRUNCATED_VALUE_HEAD; + } + // TODO: Add annotation for TRUNCATED_VALUE_UNSPECIFIED + if (value != null) { + recordInfo.addExtraHeader(HEADER_KEY_TRUNCATED, value); + } + + if (curi.getData().containsKey(A_WARC_RESPONSE_HEADERS)) { + for (Object headerObj: curi.getDataList(A_WARC_RESPONSE_HEADERS)) { + String[] kv = StringUtils.split(((String) headerObj), ":", 2); + recordInfo.addExtraHeader(kv[0].trim(), kv[1].trim()); + } + } + + ReplayInputStream ris = + curi.getRecorder().getRecordedInput().getReplayInputStream(); + recordInfo.setContentStream(ris); + + return recordInfo; + } + +} diff --git a/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java new file mode 100644 index 000000000..51a1547ad --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java @@ -0,0 +1,111 @@ +package org.archive.modules.warc; + +import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; +import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS; +import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.net.URI; +import java.util.Collection; + +import org.apache.commons.lang.StringUtils; +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.warc.WARCRecordInfo; +import org.archive.modules.CrawlURI; +import org.archive.util.ArchiveUtils; +import org.archive.util.anvl.ANVLRecord; + +public class MetadataRecordBuilder extends WARCRecordBuilder { + + /** + * If you don't want metadata records, take this class out of the chain. + */ + @Override + public boolean shouldProcess(CrawlURI curi) { + String scheme = curi.getUURI().getScheme().toLowerCase(); + return scheme.startsWith("http") || "ftp".equals(scheme); + } + + @Override + public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException { + final String timestamp = + ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.metadata); + recordInfo.setRecordId(generateRecordID()); + if (concurrentTo != null) { + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + "<" + concurrentTo + ">"); + } + recordInfo.setUrl(curi.toString()); + recordInfo.setCreate14DigitDate(timestamp); + recordInfo.setMimetype(ANVLRecord.MIMETYPE); + recordInfo.setEnforceLength(true); + + // Get some metadata from the curi. + // TODO: Get all curi metadata. + // TODO: Use other than ANVL (or rename ANVL as NameValue or use + // RFC822 (commons-httpclient?). 
+ ANVLRecord r = new ANVLRecord(); + if (curi.isSeed()) { + r.addLabel("seed"); + } else { + if (curi.forceFetch()) { + r.addLabel("force-fetch"); + } + if(StringUtils.isNotBlank(curi.getVia().toString())) { + r.addLabelValue("via", curi.getVia().toString()); + } + if(StringUtils.isNotBlank(curi.getPathFromSeed())) { + r.addLabelValue("hopsFromSeed", curi.getPathFromSeed()); + } + if (curi.containsDataKey(A_SOURCE_TAG)) { + r.addLabelValue("sourceTag", + (String)curi.getData().get(A_SOURCE_TAG)); + } + } + long duration = curi.getFetchCompletedTime() - curi.getFetchBeginTime(); + if (duration > -1) { + r.addLabelValue("fetchTimeMs", Long.toString(duration)); + } + + if (curi.getData().containsKey(A_FTP_FETCH_STATUS)) { + r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString()); + } + + if (curi.getRecorder() != null && curi.getRecorder().getCharset() != null) { + r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name()); + } + + for (String annotation: curi.getAnnotations()) { + if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) { + String[] kv = annotation.split(":", 2); + r.addLabelValue(kv[0], kv[1]); + } + } + + // Add outlinks though they are effectively useless without anchor text. + Collection links = curi.getOutLinks(); + if (links != null && links.size() > 0) { + for (CrawlURI link: links) { + r.addLabelValue("outlink", link.getURI()+" "+link.getLastHop()+" "+link.getViaContext()); + } + } + + // TODO: Other curi fields to write to metadata. + // + // Credentials + // + // fetch-began-time: 1154569278774 + // fetch-completed-time: 1154569281816 + // + // Annotations. + + byte [] b = r.getUTF8Bytes(); + recordInfo.setContentStream(new ByteArrayInputStream(b)); + recordInfo.setContentLength((long) b.length); + + return recordInfo; + } +} diff --git a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java new file mode 100644 index 000000000..525fd8e94 --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java @@ -0,0 +1,71 @@ +package org.archive.modules.warc; + +import static org.archive.format.warc.WARCConstants.HEADER_KEY_PROFILE; +import static org.archive.format.warc.WARCConstants.HEADER_KEY_TRUNCATED; +import static org.archive.format.warc.WARCConstants.HTTP_RESPONSE_MIMETYPE; +import static org.archive.format.warc.WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_LENGTH; +import static org.archive.format.warc.WARCConstants.PROFILE_REVISIT_IDENTICAL_DIGEST; + +import java.io.IOException; +import java.net.URI; +import java.util.Map; +import java.util.Map.Entry; + +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.ReplayInputStream; +import org.archive.io.warc.WARCRecordInfo; +import org.archive.modules.CrawlURI; +import org.archive.modules.revisit.RevisitProfile; +import org.archive.util.ArchiveUtils; + +public class RevisitRecordBuilder extends WARCRecordBuilder { + + @Override + public boolean shouldProcess(CrawlURI curi) { + return curi.isRevisit(); + } + + @Override + public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException { + final String timestamp = + ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); + + long revisedLength = 0; // By default, truncate all data + if (curi.getRevisitProfile().getProfileName().equals(PROFILE_REVISIT_IDENTICAL_DIGEST)) { + // Save response from identical digest 
matches + revisedLength = curi.getRecorder().getRecordedInput().getContentBegin(); + revisedLength = revisedLength > 0 + ? revisedLength + : curi.getRecorder().getRecordedInput().getSize(); + } + + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.revisit); + recordInfo.setRecordId(generateRecordID()); + recordInfo.setUrl(curi.toString()); + recordInfo.setCreate14DigitDate(timestamp); + String scheme = curi.getUURI().getScheme().toLowerCase(); + if (scheme.startsWith("http")) { + recordInfo.setMimetype(HTTP_RESPONSE_MIMETYPE); + } + recordInfo.setContentLength(revisedLength); + recordInfo.setEnforceLength(false); + + RevisitProfile revisitProfile = curi.getRevisitProfile(); + recordInfo.addExtraHeader(HEADER_KEY_PROFILE, + revisitProfile.getProfileName()); + recordInfo.addExtraHeader(HEADER_KEY_TRUNCATED, + NAMED_FIELD_TRUNCATED_VALUE_LENGTH); + + Map revisitHeaders = revisitProfile.getWarcHeaders(); + for (Entry entry: revisitHeaders.entrySet()) { + recordInfo.addExtraHeader(entry.getKey(), entry.getValue()); + } + + ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream(); + recordInfo.setContentStream(ris); + + return recordInfo; + } + +} diff --git a/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java new file mode 100644 index 000000000..a9ec884b1 --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java @@ -0,0 +1,72 @@ +package org.archive.modules.warc; + +import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL; + +import java.io.IOException; +import java.net.InetAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.UUID; + +import org.archive.io.warc.WARCRecordInfo; +import org.archive.modules.CrawlURI; +import org.archive.modules.net.CrawlHost; +import org.archive.modules.net.ServerCache; +import org.springframework.beans.factory.annotation.Autowired; + +public abstract class WARCRecordBuilder { + + transient protected ServerCache serverCache; + public ServerCache getServerCache() { + return this.serverCache; + } + @Autowired + public void setServerCache(ServerCache serverCache) { + this.serverCache = serverCache; + } + + public abstract boolean shouldProcess(CrawlURI curi); + public abstract WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) + throws IOException; + + public URI generateRecordID() { + try { + return new URI("urn:uuid:" + UUID.randomUUID()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); // impossible + } + } + + /** + * Return IP address of given URI suitable for recording (as in a + * classic ARC 5-field header line). + * + * @param curi CrawlURI + * @return String of IP address + */ + protected String getHostAddress(CrawlURI curi) { + // special handling for DNS URIs: want address of DNS server + if (curi.getUURI().getScheme().toLowerCase().equals("dns")) { + return (String)curi.getData().get(A_DNS_SERVER_IP_LABEL); + } + // otherwise, host referenced in URI + // TODO:FIXME: have fetcher insert exact IP contacted into curi, + // use that rather than inferred by CrawlHost lookup + CrawlHost h = getServerCache().getHostFor(curi.getUURI()); + if (h == null) { + throw new NullPointerException("Crawlhost is null for " + + curi + " " + curi.getVia()); + } + InetAddress a = h.getIP(); + if (a == null) { + throw new NullPointerException("Address is null for " + + curi + " " + curi.getVia() + ". 
Address " + + ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP)? + "was never looked up.": + (System.currentTimeMillis() - h.getIpFetched()) + + " ms ago.")); + } + return h.getIP().getHostAddress(); + } + +} diff --git a/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java new file mode 100644 index 000000000..cefc1da80 --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java @@ -0,0 +1,48 @@ +package org.archive.modules.warc; + +import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP; + +import java.io.IOException; +import java.net.URI; + +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.ReplayInputStream; +import org.archive.io.warc.WARCRecordInfo; +import org.archive.modules.CoreAttributeConstants; +import org.archive.modules.CrawlURI; +import org.archive.util.ArchiveUtils; + +public class WhoisResponseRecordBuilder extends WARCRecordBuilder { + + @Override + public boolean shouldProcess(CrawlURI curi) { + return "whois".equals(curi.getUURI().getScheme().toLowerCase()); + } + + @Override + public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException { + final String timestamp = + ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); + + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.response); + recordInfo.setUrl(curi.toString()); + recordInfo.setCreate14DigitDate(timestamp); + recordInfo.setMimetype(curi.getContentType()); + recordInfo.setRecordId(generateRecordID()); + recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); + recordInfo.setEnforceLength(true); + + Object whoisServerIP = curi.getData().get(CoreAttributeConstants.A_WHOIS_SERVER_IP); + if (whoisServerIP != null) { + recordInfo.addExtraHeader(HEADER_KEY_IP, whoisServerIP.toString()); + } + + ReplayInputStream ris = + curi.getRecorder().getRecordedInput().getReplayInputStream(); + recordInfo.setContentStream(ris); + + return recordInfo; + } + +} diff --git a/modules/src/main/java/org/archive/modules/writer/BaseWARCWriterProcessor.java b/modules/src/main/java/org/archive/modules/writer/BaseWARCWriterProcessor.java new file mode 100644 index 000000000..e9b2c262a --- /dev/null +++ b/modules/src/main/java/org/archive/modules/writer/BaseWARCWriterProcessor.java @@ -0,0 +1,253 @@ +package org.archive.modules.writer; + +import static org.archive.modules.CoreAttributeConstants.A_WARC_STATS; +import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST_COUNT; +import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_DATE; +import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_URL; +import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILENAME; +import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILE_OFFSET; +import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_RECORD_ID; +import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WRITE_TAG; + +import java.io.IOException; +import java.net.InetAddress; +import java.net.URI; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import 
java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.lang.StringUtils; +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.warc.WARCRecordInfo; +import org.archive.io.warc.WARCWriter; +import org.archive.io.warc.WARCWriterPool; +import org.archive.io.warc.WARCWriterPoolSettings; +import org.archive.modules.CrawlMetadata; +import org.archive.modules.CrawlURI; +import org.archive.modules.revisit.IdenticalPayloadDigestRevisit; +import org.archive.spring.ConfigPath; +import org.archive.uid.RecordIDGenerator; +import org.archive.uid.UUIDGenerator; +import org.archive.util.ArchiveUtils; +import org.archive.util.anvl.ANVLRecord; + +abstract public class BaseWARCWriterProcessor extends WriterPoolProcessor + implements WARCWriterPoolSettings { + + private static final Logger logger = + Logger.getLogger(BaseWARCWriterProcessor.class.getName()); + + protected AtomicLong urlsWritten = new AtomicLong(); + protected ConcurrentMap> stats = new ConcurrentHashMap>(); + public ConcurrentMap> getStats() { + return stats; + } + + + /** + * Generator for record IDs + */ + protected RecordIDGenerator generator = new UUIDGenerator(); + public RecordIDGenerator getRecordIDGenerator() { + return generator; + } + public void setRecordIDGenerator(RecordIDGenerator generator) { + this.generator = generator; + } + + protected URI getRecordID() throws IOException { + return generator.getRecordID(); + } + + public long getDefaultMaxFileSize() { + return 1000000000L; // 1 SI giga-byte (10^9 bytes), per WARC appendix A + } + + public List getDefaultStorePaths() { + List paths = new ArrayList(); + paths.add(new ConfigPath("warcs default store path", "warcs")); + return paths; + } + + @Override + protected void setupPool(final AtomicInteger serialNo) { + setPool(new WARCWriterPool(serialNo, this, getPoolMaxActive(), getMaxWaitForIdleMs())); + } + + private transient List cachedMetadata; + public List getMetadata() { + if (cachedMetadata != null) { + return cachedMetadata; + } + ANVLRecord record = new ANVLRecord(); + record.addLabelValue("software", "Heritrix/" + + ArchiveUtils.VERSION + " http://crawler.archive.org"); + try { + InetAddress host = InetAddress.getLocalHost(); + record.addLabelValue("ip", host.getHostAddress()); + record.addLabelValue("hostname", host.getCanonicalHostName()); + } catch (UnknownHostException e) { + logger.log(Level.WARNING,"unable top obtain local crawl engine host",e); + } + + // conforms to ISO 28500:2009 as of May 2009 + // as described at http://bibnum.bnf.fr/WARC/ + // latest draft as of November 2008 + record.addLabelValue("format","WARC File Format 1.0"); + record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"); + + // Get other values from metadata provider + + CrawlMetadata provider = getMetadataProvider(); + + addIfNotBlank(record,"operator", provider.getOperator()); + addIfNotBlank(record,"publisher", provider.getOrganization()); + addIfNotBlank(record,"audience", provider.getAudience()); + addIfNotBlank(record,"isPartOf", provider.getJobName()); + // TODO: make date match 'job creation date' as in Heritrix 1.x + // until then, leave out (plenty of dates already in WARC + // records +// String rawDate = provider.getBeginDate(); +// if(StringUtils.isNotBlank(rawDate)) { +// Date date; +// try { +// date = ArchiveUtils.parse14DigitDate(rawDate); +// 
addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date)); +// } catch (ParseException e) { +// logger.log(Level.WARNING,"obtaining warc created date",e); +// } +// } + addIfNotBlank(record,"description", provider.getDescription()); + addIfNotBlank(record,"robots", provider.getRobotsPolicyName().toLowerCase()); + + addIfNotBlank(record,"http-header-user-agent", + provider.getUserAgent()); + addIfNotBlank(record,"http-header-from", + provider.getOperatorFrom()); + + // really ugly to return as List, but changing would require + // larger refactoring + return Collections.singletonList(record.toString()); + } + + protected void addIfNotBlank(ANVLRecord record, String label, String value) { + if(StringUtils.isNotBlank(value)) { + record.addLabelValue(label, value); + } + } + + + protected void addStats(Map> substats) { + for (String key: substats.keySet()) { + // intentionally redundant here -- if statement avoids creating + // unused empty map every time; putIfAbsent() ensures thread safety + if (stats.get(key) == null) { + stats.putIfAbsent(key, new ConcurrentHashMap()); + } + + for (String subkey: substats.get(key).keySet()) { + AtomicLong oldValue = stats.get(key).get(subkey); + if (oldValue == null) { + oldValue = stats.get(key).putIfAbsent(subkey, new AtomicLong(substats.get(key).get(subkey))); + } + if (oldValue != null) { + oldValue.addAndGet(substats.get(key).get(subkey)); + } + } + } + } + + @Override + public String report() { + // XXX note in report that stats include recovered checkpoint? + logger.info("final stats: " + stats); + + StringBuilder buf = new StringBuilder(); + buf.append("Processor: " + getClass().getName() + "\n"); + buf.append(" Function: Writes WARCs\n"); + buf.append(" Total CrawlURIs: " + urlsWritten + "\n"); + buf.append(" Revisit records: " + WARCWriter.getStat(stats, WARCRecordType.revisit.toString(), WARCWriter.NUM_RECORDS) + "\n"); + + long bytes = WARCWriter.getStat(stats, WARCRecordType.response.toString(), WARCWriter.CONTENT_BYTES) + + WARCWriter.getStat(stats, WARCRecordType.resource.toString(), WARCWriter.CONTENT_BYTES); + buf.append(" Crawled content bytes (including http headers): " + + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n"); + + bytes = WARCWriter.getStat(stats, WARCWriter.TOTALS, WARCWriter.TOTAL_BYTES); + buf.append(" Total uncompressed bytes (including all warc records): " + + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n"); + + buf.append(" Total size on disk ("+ (getCompress() ? 
"compressed" : "uncompressed") + "): " + + getTotalBytesWritten() + " (" + ArchiveUtils.formatBytesForDisplay(getTotalBytesWritten()) + ")\n"); + + return buf.toString(); + } + + protected Map> copyStats(Map> orig) { + Map> copy = new HashMap>(orig.size()); + for (String k: orig.keySet()) { + copy.put(k, new HashMap(orig.get(k))); + } + return copy; + } + + protected void updateMetadataAfterWrite(final CrawlURI curi, + WARCWriter writer, long startPosition) { + if (WARCWriter.getStat(writer.getTmpStats(), WARCWriter.TOTALS, WARCWriter.NUM_RECORDS) > 0l) { + addStats(writer.getTmpStats()); + urlsWritten.incrementAndGet(); + } + if (logger.isLoggable(Level.FINE)) { + logger.fine("wrote " + + WARCWriter.getStat(writer.getTmpStats(), WARCWriter.TOTALS, WARCWriter.SIZE_ON_DISK) + + " bytes to " + writer.getFile().getName() + " for " + curi); + } + setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - startPosition)); + + curi.addExtraInfo("warcFilename", writer.getFilenameWithoutOccupiedSuffix()); + curi.addExtraInfo("warcFileOffset", startPosition); + + curi.getData().put(A_WARC_STATS, copyStats(writer.getTmpStats())); + + // history for uri-based dedupe + Map[] history = curi.getFetchHistory(); + if (history != null && history[0] != null) { + history[0].put(A_WRITE_TAG, writer.getFilenameWithoutOccupiedSuffix()); + } + + // history for uri-agnostic, content digest based dedupe + if (curi.getContentDigest() != null && curi.hasContentDigestHistory()) { + for (WARCRecordInfo warcRecord: writer.getTmpRecordLog()) { + if ((warcRecord.getType() == WARCRecordType.response + || warcRecord.getType() == WARCRecordType.resource) + && warcRecord.getContentStream() != null + && warcRecord.getContentLength() > 0) { + curi.getContentDigestHistory().put(A_ORIGINAL_URL, warcRecord.getUrl()); + curi.getContentDigestHistory().put(A_WARC_RECORD_ID, warcRecord.getRecordId().toString()); + curi.getContentDigestHistory().put(A_WARC_FILENAME, warcRecord.getWARCFilename()); + curi.getContentDigestHistory().put(A_WARC_FILE_OFFSET, warcRecord.getWARCFileOffset()); + curi.getContentDigestHistory().put(A_ORIGINAL_DATE, warcRecord.getCreate14DigitDate()); + curi.getContentDigestHistory().put(A_CONTENT_DIGEST_COUNT, 1); + } else if (warcRecord.getType() == WARCRecordType.revisit + && curi.getRevisitProfile() instanceof IdenticalPayloadDigestRevisit) { + Integer oldCount = (Integer) curi.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT); + if (oldCount == null) { + // shouldn't happen, log a warning? 
+ oldCount = 1; + } + curi.getContentDigestHistory().put(A_CONTENT_DIGEST_COUNT, oldCount + 1); + } + } + } + } + +} diff --git a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java new file mode 100644 index 000000000..eca530b7c --- /dev/null +++ b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java @@ -0,0 +1,126 @@ +package org.archive.modules.writer; + +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.archive.io.warc.WARCRecordInfo; +import org.archive.io.warc.WARCWriter; +import org.archive.modules.CrawlURI; +import org.archive.modules.ProcessResult; +import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule; +import org.archive.modules.warc.WARCRecordBuilder; +import org.archive.spring.HasKeyedProperties; + +public class WARCWriterChainProcessor extends BaseWARCWriterProcessor implements HasKeyedProperties { + private static final Logger logger = + Logger.getLogger(WARCWriterChainProcessor.class.getName()); + + { + setChain(new ArrayList()); + } + @SuppressWarnings("unchecked") + public List getChain() { + return (List) kp.get("chain"); + } + public void setChain(List chain) { + kp.put("chain", chain); + } + + @Override + protected boolean shouldWrite(CrawlURI curi) { + if (getSkipIdenticalDigests() + && IdenticalDigestDecideRule.hasIdenticalDigest(curi)) { + curi.getAnnotations().add(ANNOTATION_UNWRITTEN + + ":identicalDigest"); + return false; + } + + // WARCWriterProcessor has seemingly unnecessarily complicated logic + if (curi.getFetchStatus() <= 0) { + curi.getAnnotations().add(ANNOTATION_UNWRITTEN + ":status"); + return false; + } + + return true; + } + + @Override + protected ProcessResult innerProcessResult(CrawlURI curi) { + try { + if (shouldWrite(curi)) { + return write(curi); + } else { + copyForwardWriteTagIfDupe(curi); + } + } catch (IOException e) { + curi.getNonFatalFailures().add(e); + logger.log(Level.SEVERE, "Failed write of Records: " + + curi.toString(), e); + } + return ProcessResult.PROCEED; + } + + protected ProcessResult write(final CrawlURI curi) + throws IOException { + WARCWriter writer = (WARCWriter) getPool().borrowFile(); + + // Reset writer temp stats so they reflect only this set of records. + writer.resetTmpStats(); + writer.resetTmpRecordLog(); + + long position = writer.getPosition(); + try { + // Roll over to new warc file if we've exceeded maxBytes. + writer.checkSize(); + if (writer.getPosition() != position) { + // We rolled over to a new warc and wrote a warcinfo record. + // Tally stats and reset temp stats, to avoid including warcinfo + // record in stats for current url. + setTotalBytesWritten(getTotalBytesWritten() + + (writer.getPosition() - position)); + addStats(writer.getTmpStats()); + writer.resetTmpStats(); + writer.resetTmpRecordLog(); + + position = writer.getPosition(); + } + + writeRecords(curi, writer); + } catch (IOException e) { + // Invalidate this file (It gets a '.invalid' suffix). + getPool().invalidateFile(writer); + // Set the writer to null otherwise the pool accounting + // of how many active writers gets skewed if we subsequently + // do a returnWriter call on this object in the finally block. 
+ writer = null; + throw e; + } finally { + if (writer != null) { + updateMetadataAfterWrite(curi, writer, position); + getPool().returnFile(writer); + } + } + // XXX this looks wrong, check should happen *before* writing the + // record, the way checkBytesWritten() currently works + return checkBytesWritten(); + } + + protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException { + URI concurrentTo = null; + for (WARCRecordBuilder recordBuilder: getChain()) { + if (recordBuilder.shouldProcess(curi)) { + WARCRecordInfo record = recordBuilder.buildRecord(curi, concurrentTo); + if (record != null) { + writer.writeRecord(record); + if (concurrentTo == null) { + concurrentTo = record.getRecordId(); + } + } + } + } + } +} diff --git a/modules/src/main/java/org/archive/modules/writer/WARCWriterProcessor.java b/modules/src/main/java/org/archive/modules/writer/WARCWriterProcessor.java index b17d1d54e..4726a0079 100644 --- a/modules/src/main/java/org/archive/modules/writer/WARCWriterProcessor.java +++ b/modules/src/main/java/org/archive/modules/writer/WARCWriterProcessor.java @@ -37,33 +37,16 @@ import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS; import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG; import static org.archive.modules.CoreAttributeConstants.A_WARC_RESPONSE_HEADERS; -import static org.archive.modules.CoreAttributeConstants.A_WARC_STATS; import static org.archive.modules.CoreAttributeConstants.HEADER_TRUNC; import static org.archive.modules.CoreAttributeConstants.LENGTH_TRUNC; import static org.archive.modules.CoreAttributeConstants.TIMER_TRUNC; -import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST_COUNT; -import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_DATE; -import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_URL; -import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILENAME; -import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILE_OFFSET; -import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_RECORD_ID; -import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WRITE_TAG; import java.io.ByteArrayInputStream; import java.io.IOException; -import java.net.InetAddress; import java.net.URI; -import java.net.UnknownHostException; -import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; -import java.util.List; import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; import java.util.logging.Logger; @@ -73,17 +56,11 @@ import org.archive.io.ReplayInputStream; import org.archive.io.warc.WARCRecordInfo; import org.archive.io.warc.WARCWriter; -import org.archive.io.warc.WARCWriterPool; import org.archive.io.warc.WARCWriterPoolSettings; import org.archive.modules.CoreAttributeConstants; -import org.archive.modules.CrawlMetadata; import org.archive.modules.CrawlURI; import org.archive.modules.ProcessResult; -import org.archive.modules.revisit.IdenticalPayloadDigestRevisit; import org.archive.modules.revisit.RevisitProfile; -import org.archive.spring.ConfigPath; -import org.archive.uid.RecordIDGenerator; -import org.archive.uid.UUIDGenerator; import org.archive.util.ArchiveUtils; import org.archive.util.anvl.ANVLRecord; import 
org.json.JSONException; @@ -97,29 +74,16 @@ * (commons-httpclient?) or find something else. * * @author stack + * + * @deprecated in favor of {@link WARCWriterChainProcessor} */ -public class WARCWriterProcessor extends WriterPoolProcessor implements WARCWriterPoolSettings { +@Deprecated +public class WARCWriterProcessor extends BaseWARCWriterProcessor implements WARCWriterPoolSettings { @SuppressWarnings("unused") private static final long serialVersionUID = 6182850087635847443L; private static final Logger logger = Logger.getLogger(WARCWriterProcessor.class.getName()); - private ConcurrentMap> stats = new ConcurrentHashMap>(); - public ConcurrentMap> getStats() { - return stats; - } - - private AtomicLong urlsWritten = new AtomicLong(); - - public long getDefaultMaxFileSize() { - return 1000000000L; // 1 SI giga-byte (10^9 bytes), per WARC appendix A - } - public List getDefaultStorePaths() { - List paths = new ArrayList(); - paths.add(new ConfigPath("warcs default store path", "warcs")); - return paths; - } - /** * Whether to write 'request' type records. Default is true. */ @@ -145,17 +109,6 @@ public boolean getWriteMetadata() { public void setWriteMetadata(boolean writeMetadata) { kp.put("writeMetadata",writeMetadata); } - - /** - * Generator for record IDs - */ - protected RecordIDGenerator generator = new UUIDGenerator(); - public RecordIDGenerator getRecordIDGenerator() { - return generator; - } - public void setRecordIDGenerator(RecordIDGenerator generator) { - this.generator = generator; - } @Deprecated public void setWriteRevisitForIdenticalDigests(boolean writeRevisits) { @@ -167,16 +120,9 @@ public void setWriteRevisitForNotModified(boolean writeRevisits) { logger.warning("setting writeRevisitForNotModified is deprecated, value ignored"); } - private transient List cachedMetadata; - public WARCWriterProcessor() { } - @Override - protected void setupPool(final AtomicInteger serialNo) { - setPool(new WARCWriterPool(serialNo, this, getPoolMaxActive(), getMaxWaitForIdleMs())); - } - /** * Writes a CrawlURI and its associated data to store file. 
* @@ -264,84 +210,6 @@ protected ProcessResult write(final String lowerCaseScheme, return checkBytesWritten(); } - protected Map> copyStats(Map> orig) { - Map> copy = new HashMap>(orig.size()); - for (String k: orig.keySet()) { - copy.put(k, new HashMap(orig.get(k))); - } - return copy; - } - - protected void updateMetadataAfterWrite(final CrawlURI curi, - WARCWriter writer, long startPosition) { - if (WARCWriter.getStat(writer.getTmpStats(), WARCWriter.TOTALS, WARCWriter.NUM_RECORDS) > 0l) { - addStats(writer.getTmpStats()); - urlsWritten.incrementAndGet(); - } - if (logger.isLoggable(Level.FINE)) { - logger.fine("wrote " - + WARCWriter.getStat(writer.getTmpStats(), WARCWriter.TOTALS, WARCWriter.SIZE_ON_DISK) - + " bytes to " + writer.getFile().getName() + " for " + curi); - } - setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - startPosition)); - - curi.addExtraInfo("warcFilename", writer.getFilenameWithoutOccupiedSuffix()); - curi.addExtraInfo("warcFileOffset", startPosition); - - curi.getData().put(A_WARC_STATS, copyStats(writer.getTmpStats())); - - // history for uri-based dedupe - Map[] history = curi.getFetchHistory(); - if (history != null && history[0] != null) { - history[0].put(A_WRITE_TAG, writer.getFilenameWithoutOccupiedSuffix()); - } - - // history for uri-agnostic, content digest based dedupe - if (curi.getContentDigest() != null && curi.hasContentDigestHistory()) { - for (WARCRecordInfo warcRecord: writer.getTmpRecordLog()) { - if ((warcRecord.getType() == WARCRecordType.response - || warcRecord.getType() == WARCRecordType.resource) - && warcRecord.getContentStream() != null - && warcRecord.getContentLength() > 0) { - curi.getContentDigestHistory().put(A_ORIGINAL_URL, warcRecord.getUrl()); - curi.getContentDigestHistory().put(A_WARC_RECORD_ID, warcRecord.getRecordId().toString()); - curi.getContentDigestHistory().put(A_WARC_FILENAME, warcRecord.getWARCFilename()); - curi.getContentDigestHistory().put(A_WARC_FILE_OFFSET, warcRecord.getWARCFileOffset()); - curi.getContentDigestHistory().put(A_ORIGINAL_DATE, warcRecord.getCreate14DigitDate()); - curi.getContentDigestHistory().put(A_CONTENT_DIGEST_COUNT, 1); - } else if (warcRecord.getType() == WARCRecordType.revisit - && curi.getRevisitProfile() instanceof IdenticalPayloadDigestRevisit) { - Integer oldCount = (Integer) curi.getContentDigestHistory().get(A_CONTENT_DIGEST_COUNT); - if (oldCount == null) { - // shouldn't happen, log a warning? 
- oldCount = 1; - } - curi.getContentDigestHistory().put(A_CONTENT_DIGEST_COUNT, oldCount + 1); - } - } - } - } - - protected void addStats(Map> substats) { - for (String key: substats.keySet()) { - // intentionally redundant here -- if statement avoids creating - // unused empty map every time; putIfAbsent() ensures thread safety - if (stats.get(key) == null) { - stats.putIfAbsent(key, new ConcurrentHashMap()); - } - - for (String subkey: substats.get(key).keySet()) { - AtomicLong oldValue = stats.get(key).get(subkey); - if (oldValue == null) { - oldValue = stats.get(key).putIfAbsent(subkey, new AtomicLong(substats.get(key).get(subkey))); - } - if (oldValue != null) { - oldValue.addAndGet(substats.get(key).get(subkey)); - } - } - } - } - protected void writeDnsRecords(final CrawlURI curi, WARCWriter w, final URI baseid, final String timestamp) throws IOException { WARCRecordInfo recordInfo = new WARCRecordInfo(); @@ -767,10 +635,6 @@ protected URI writeMetadata(final WARCWriter w, return recordInfo.getRecordId(); } - protected URI getRecordID() throws IOException { - return generator.getRecordID(); - } - protected URI qualifyRecordID(final URI base, final String key, final String value) throws IOException { @@ -778,67 +642,6 @@ protected URI qualifyRecordID(final URI base, final String key, qualifiers.put(key, value); return generator.qualifyRecordID(base, qualifiers); } - - public List getMetadata() { - if (cachedMetadata != null) { - return cachedMetadata; - } - ANVLRecord record = new ANVLRecord(); - record.addLabelValue("software", "Heritrix/" + - ArchiveUtils.VERSION + " http://crawler.archive.org"); - try { - InetAddress host = InetAddress.getLocalHost(); - record.addLabelValue("ip", host.getHostAddress()); - record.addLabelValue("hostname", host.getCanonicalHostName()); - } catch (UnknownHostException e) { - logger.log(Level.WARNING,"unable top obtain local crawl engine host",e); - } - - // conforms to ISO 28500:2009 as of May 2009 - // as described at http://bibnum.bnf.fr/WARC/ - // latest draft as of November 2008 - record.addLabelValue("format","WARC File Format 1.0"); - record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"); - - // Get other values from metadata provider - - CrawlMetadata provider = getMetadataProvider(); - - addIfNotBlank(record,"operator", provider.getOperator()); - addIfNotBlank(record,"publisher", provider.getOrganization()); - addIfNotBlank(record,"audience", provider.getAudience()); - addIfNotBlank(record,"isPartOf", provider.getJobName()); - // TODO: make date match 'job creation date' as in Heritrix 1.x - // until then, leave out (plenty of dates already in WARC - // records -// String rawDate = provider.getBeginDate(); -// if(StringUtils.isNotBlank(rawDate)) { -// Date date; -// try { -// date = ArchiveUtils.parse14DigitDate(rawDate); -// addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date)); -// } catch (ParseException e) { -// logger.log(Level.WARNING,"obtaining warc created date",e); -// } -// } - addIfNotBlank(record,"description", provider.getDescription()); - addIfNotBlank(record,"robots", provider.getRobotsPolicyName().toLowerCase()); - - addIfNotBlank(record,"http-header-user-agent", - provider.getUserAgent()); - addIfNotBlank(record,"http-header-from", - provider.getOperatorFrom()); - - // really ugly to return as List, but changing would require - // larger refactoring - return Collections.singletonList(record.toString()); - } - - protected void addIfNotBlank(ANVLRecord record, String 
label, String value) { - if(StringUtils.isNotBlank(value)) { - record.addLabelValue(label, value); - } - } @Override protected JSONObject toCheckpointJson() throws JSONException { @@ -878,31 +681,5 @@ protected void fromCheckpointJson(JSONObject json) throws JSONException { } } } - - @Override - public String report() { - // XXX note in report that stats include recovered checkpoint? - logger.info("final stats: " + stats); - - StringBuilder buf = new StringBuilder(); - buf.append("Processor: " + getClass().getName() + "\n"); - buf.append(" Function: Writes WARCs\n"); - buf.append(" Total CrawlURIs: " + urlsWritten + "\n"); - buf.append(" Revisit records: " + WARCWriter.getStat(stats, WARCRecordType.revisit.toString(), WARCWriter.NUM_RECORDS) + "\n"); - - long bytes = WARCWriter.getStat(stats, WARCRecordType.response.toString(), WARCWriter.CONTENT_BYTES) - + WARCWriter.getStat(stats, WARCRecordType.resource.toString(), WARCWriter.CONTENT_BYTES); - buf.append(" Crawled content bytes (including http headers): " - + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n"); - - bytes = WARCWriter.getStat(stats, WARCWriter.TOTALS, WARCWriter.TOTAL_BYTES); - buf.append(" Total uncompressed bytes (including all warc records): " - + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n"); - - buf.append(" Total size on disk ("+ (getCompress() ? "compressed" : "uncompressed") + "): " - + getTotalBytesWritten() + " (" + ArchiveUtils.formatBytesForDisplay(getTotalBytesWritten()) + ")\n"); - - return buf.toString(); - } } diff --git a/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java b/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java index c24423eb9..12a9cabd0 100644 --- a/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java +++ b/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java @@ -45,6 +45,7 @@ import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule; import org.archive.modules.net.CrawlHost; import org.archive.modules.net.ServerCache; +import org.archive.modules.warc.WARCRecordBuilder; import org.archive.spring.ConfigPath; import org.archive.util.FileUtils; import org.json.JSONException; @@ -370,7 +371,10 @@ protected boolean shouldWrite(CrawlURI curi) { * * @param curi CrawlURI * @return String of IP address + * + * @deprecated WARCRecordBuilder instances use {@link WARCRecordBuilder#getHostAddress(CrawlURI)} */ + @Deprecated protected String getHostAddress(CrawlURI curi) { // special handling for DNS URIs: want address of DNS server if (curi.getUURI().getScheme().toLowerCase().equals("dns")) { From 870d84740a942c724cc827407b96efbc48cc027d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 12 Jun 2019 11:04:02 -0700 Subject: [PATCH 005/123] same test as for WARCWriterProcessor doesn't test that much though --- .../writer/WARCWriterChainProcessorTest.java | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 modules/src/test/java/org/archive/modules/writer/WARCWriterChainProcessorTest.java diff --git a/modules/src/test/java/org/archive/modules/writer/WARCWriterChainProcessorTest.java b/modules/src/test/java/org/archive/modules/writer/WARCWriterChainProcessorTest.java new file mode 100644 index 000000000..69fcf9921 --- /dev/null +++ b/modules/src/test/java/org/archive/modules/writer/WARCWriterChainProcessorTest.java @@ -0,0 +1,28 @@ +package org.archive.modules.writer; + +import java.io.File; + +import org.archive.modules.CrawlMetadata; +import 
org.archive.modules.fetcher.DefaultServerCache; +import org.archive.spring.ConfigPath; +import org.archive.util.FileUtils; +import org.archive.util.TmpDirTestCase; + +public class WARCWriterChainProcessorTest extends WARCWriterProcessorTest { + + @Override + protected Object makeModule() throws Exception { + File tmp = TmpDirTestCase.tmpDir(); + tmp = new File(tmp, getClass().getSimpleName()); + FileUtils.ensureWriteableDirectory(tmp); + + WARCWriterChainProcessor result = new WARCWriterChainProcessor(); + result.setDirectory(new ConfigPath("test", tmp.getAbsolutePath())); + result.setServerCache(new DefaultServerCache()); + CrawlMetadata metadata = new CrawlMetadata(); + metadata.afterPropertiesSet(); + result.setMetadataProvider(metadata); + result.start(); + return result; + } +} From ece435874e57807f776924e21cb52079c4520cae Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 12 Jun 2019 12:53:27 -0700 Subject: [PATCH 006/123] oops, handle https too --- .../org/archive/modules/warc/HttpResponseRecordBuilder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java index 0f83f5e9a..b6f69e885 100644 --- a/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java @@ -27,7 +27,7 @@ public class HttpResponseRecordBuilder extends WARCRecordBuilder { @Override public boolean shouldProcess(CrawlURI curi) { return !curi.isRevisit() - && "http".equals(curi.getUURI().getScheme().toLowerCase()); + && curi.getUURI().getScheme().toLowerCase().startsWith("http"); } @Override From 8c4c443a88f410b46f811a98e24760a06d91f4b6 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 12 Jun 2019 12:54:00 -0700 Subject: [PATCH 007/123] default chain in code --- .../restlet/profile-crawler-beans.cxml | 2 ++ .../writer/WARCWriterChainProcessor.java | 20 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml index 96106c803..d3d02f3fc 100644 --- a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml +++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml @@ -350,6 +350,7 @@ http://example.example/example --> + diff --git a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java index eca530b7c..84e1b6f36 100644 --- a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java +++ b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java @@ -2,7 +2,7 @@ import java.io.IOException; import java.net.URI; -import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; @@ -12,7 +12,15 @@ import org.archive.modules.CrawlURI; import org.archive.modules.ProcessResult; import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule; +import org.archive.modules.warc.DnsResponseRecordBuilder; +import org.archive.modules.warc.FtpControlConversationRecordBuilder; +import org.archive.modules.warc.FtpResponseRecordBuilder; +import org.archive.modules.warc.HttpRequestRecordBuilder; 
+import org.archive.modules.warc.HttpResponseRecordBuilder; +import org.archive.modules.warc.MetadataRecordBuilder; +import org.archive.modules.warc.RevisitRecordBuilder; import org.archive.modules.warc.WARCRecordBuilder; +import org.archive.modules.warc.WhoisResponseRecordBuilder; import org.archive.spring.HasKeyedProperties; public class WARCWriterChainProcessor extends BaseWARCWriterProcessor implements HasKeyedProperties { @@ -20,7 +28,15 @@ public class WARCWriterChainProcessor extends BaseWARCWriterProcessor implements Logger.getLogger(WARCWriterChainProcessor.class.getName()); { - setChain(new ArrayList()); + setChain(Arrays.asList( + new DnsResponseRecordBuilder(), + new HttpResponseRecordBuilder(), + new WhoisResponseRecordBuilder(), + new FtpControlConversationRecordBuilder(), + new FtpResponseRecordBuilder(), + new RevisitRecordBuilder(), + new HttpRequestRecordBuilder(), + new MetadataRecordBuilder())); } @SuppressWarnings("unchecked") public List getChain() { From 7c31b0752ac1d08fdcea4149d6bb3ae17a3f5b31 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 12 Jun 2019 12:56:49 -0700 Subject: [PATCH 008/123] use WARCWriterChainProcessor for these tests --- .../archive/crawler/selftest/StatisticsSelfTest.java | 2 +- .../modules/recrawl/ContentDigestHistoryTest.java | 6 +++--- .../modules/writer/WARCWriterChainProcessorTest.java | 11 +++++++++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java b/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java index 8d880bf34..3e66a064a 100644 --- a/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java +++ b/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java @@ -25,7 +25,7 @@ public class StatisticsSelfTest extends SelfTestBase { @Override protected String changeGlobalConfig(String config) { - String warcWriterConfig = " \n"; + String warcWriterConfig = " \n"; config = config.replace("", warcWriterConfig); return super.changeGlobalConfig(config); } diff --git a/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java b/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java index ea2980e06..049a0610c 100644 --- a/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java +++ b/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java @@ -62,8 +62,8 @@ import org.archive.modules.CrawlURI; import org.archive.modules.fetcher.FetchHTTP; import org.archive.modules.fetcher.FetchHTTPTests; -import org.archive.modules.writer.WARCWriterProcessor; -import org.archive.modules.writer.WARCWriterProcessorTest; +import org.archive.modules.writer.WARCWriterChainProcessor; +import org.archive.modules.writer.WARCWriterChainProcessorTest; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.spring.ConfigPath; @@ -218,7 +218,7 @@ public void testWarcDedupe() throws Exception { Server server = newHttpServer(); FetchHTTP fetcher = FetchHTTPTests.newTestFetchHttp(getClass().getName()); - WARCWriterProcessor warcWriter = WARCWriterProcessorTest.newTestWarcWriter(getClass().getName()); + WARCWriterChainProcessor warcWriter = WARCWriterChainProcessorTest.makeTestWARCWriterChainProcessor(); warcWriter.setServerCache(fetcher.getServerCache()); for (File dir: warcWriter.calcOutputDirs()) { /* make sure we don't have other stuff hanging around that will diff --git 
a/modules/src/test/java/org/archive/modules/writer/WARCWriterChainProcessorTest.java b/modules/src/test/java/org/archive/modules/writer/WARCWriterChainProcessorTest.java index 69fcf9921..37a232b7c 100644 --- a/modules/src/test/java/org/archive/modules/writer/WARCWriterChainProcessorTest.java +++ b/modules/src/test/java/org/archive/modules/writer/WARCWriterChainProcessorTest.java @@ -1,6 +1,7 @@ package org.archive.modules.writer; import java.io.File; +import java.io.IOException; import org.archive.modules.CrawlMetadata; import org.archive.modules.fetcher.DefaultServerCache; @@ -12,8 +13,15 @@ public class WARCWriterChainProcessorTest extends WARCWriterProcessorTest { @Override protected Object makeModule() throws Exception { + WARCWriterChainProcessor result = makeTestWARCWriterChainProcessor(); + result.start(); + return result; + } + + public static WARCWriterChainProcessor makeTestWARCWriterChainProcessor() + throws IOException { File tmp = TmpDirTestCase.tmpDir(); - tmp = new File(tmp, getClass().getSimpleName()); + tmp = new File(tmp, WARCWriterChainProcessorTest.class.getSimpleName()); FileUtils.ensureWriteableDirectory(tmp); WARCWriterChainProcessor result = new WARCWriterChainProcessor(); @@ -22,7 +30,6 @@ protected Object makeModule() throws Exception { CrawlMetadata metadata = new CrawlMetadata(); metadata.afterPropertiesSet(); result.setMetadataProvider(metadata); - result.start(); return result; } } From e928c218e7fbf1158707d912fa6853e2e026d647 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 12 Jun 2019 12:57:27 -0700 Subject: [PATCH 009/123] accept any BaseWARCWriterProcessor --- .../reporting/XmlCrawlSummaryReport.java | 4 +-- .../postprocessor/WARCLimitEnforcer.java | 25 ++++++++++--------- .../recrawl/TroughContentDigestHistory.java | 4 +-- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/contrib/src/main/java/org/archive/crawler/reporting/XmlCrawlSummaryReport.java b/contrib/src/main/java/org/archive/crawler/reporting/XmlCrawlSummaryReport.java index 4167fe3c8..81bbd3a2c 100644 --- a/contrib/src/main/java/org/archive/crawler/reporting/XmlCrawlSummaryReport.java +++ b/contrib/src/main/java/org/archive/crawler/reporting/XmlCrawlSummaryReport.java @@ -6,7 +6,7 @@ import java.util.Map; import org.archive.crawler.restlet.XmlMarshaller; -import org.archive.modules.writer.WARCWriterProcessor; +import org.archive.modules.writer.BaseWARCWriterProcessor; import org.archive.util.ArchiveUtils; public class XmlCrawlSummaryReport extends Report { @@ -28,7 +28,7 @@ public void write(PrintWriter writer, StatisticsTracker stats) { CrawlStatSnapshot snapshot = stats.getLastSnapshot(); info.put("crawlName", - ((WARCWriterProcessor) stats.appCtx.getBean("warcWriter")).getPrefix()); + ((BaseWARCWriterProcessor) stats.appCtx.getBean("warcWriter")).getPrefix()); info.put("crawlJobShortName", stats.getCrawlController().getMetadata().getJobName()); info.put("scheduledDate", this.scheduledDate); diff --git a/contrib/src/main/java/org/archive/modules/postprocessor/WARCLimitEnforcer.java b/contrib/src/main/java/org/archive/modules/postprocessor/WARCLimitEnforcer.java index bd29362fa..fcd3055e4 100644 --- a/contrib/src/main/java/org/archive/modules/postprocessor/WARCLimitEnforcer.java +++ b/contrib/src/main/java/org/archive/modules/postprocessor/WARCLimitEnforcer.java @@ -18,17 +18,18 @@ */ package org.archive.modules.postprocessor; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicLong; 
import java.util.logging.Logger; -import java.util.ArrayList; -import java.util.List; + import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.CrawlStatus; import org.archive.modules.CrawlURI; import org.archive.modules.Processor; -import org.archive.modules.writer.WARCWriterProcessor; +import org.archive.modules.writer.BaseWARCWriterProcessor; import org.springframework.beans.factory.annotation.Autowired; public class WARCLimitEnforcer extends Processor { @@ -38,7 +39,7 @@ public class WARCLimitEnforcer extends Processor { protected Map> limits = new HashMap>(); /** - * Should match structure of {@link WARCWriterProcessor#getStats()} + * Should match structure of {@link BaseWARCWriterProcessor#getStats()} * @param limits */ public void setLimits(Map> limits) { @@ -48,23 +49,23 @@ public Map> getLimits() { return limits; } - protected WARCWriterProcessor warcWriter; + protected BaseWARCWriterProcessor warcWriter; @Autowired - public void setWarcWriter(WARCWriterProcessor warcWriter) { + public void setWarcWriter(BaseWARCWriterProcessor warcWriter) { this.warcWriter = warcWriter; } - public WARCWriterProcessor getWarcWriter() { + public BaseWARCWriterProcessor getWarcWriter() { return warcWriter; } { - setWarcWriters(new ArrayList()); + setWarcWriters(new ArrayList()); } @SuppressWarnings("unchecked") - public List getWarcWriters() { - return (List) kp.get("warcWriters"); + public List getWarcWriters() { + return (List) kp.get("warcWriters"); } - public void setWarcWriters(List warcWriters) { + public void setWarcWriters(List warcWriters) { kp.put("warcWriters", warcWriters); } @@ -91,7 +92,7 @@ protected void innerProcess(CrawlURI uri) throws InterruptedException { AtomicLong value = null; if(getWarcWriters() !=null && getWarcWriters().size()>0) { value = new AtomicLong(0); - for (WARCWriterProcessor w: getWarcWriters()) { + for (BaseWARCWriterProcessor w: getWarcWriters()) { Map valueBucket = w.getStats().get(j); if(valueBucket != null) { value.set(value.addAndGet(valueBucket.get(k).get())); diff --git a/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java b/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java index 721efc964..73f5e1a16 100644 --- a/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java +++ b/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java @@ -17,7 +17,7 @@ import org.archive.crawler.event.CrawlStateEvent; import org.archive.modules.CrawlURI; -import org.archive.modules.writer.WARCWriterProcessor; +import org.archive.modules.writer.WARCWriterChainProcessor; import org.archive.spring.HasKeyedProperties; import org.archive.spring.KeyedProperties; import org.archive.trough.TroughClient; @@ -31,7 +31,7 @@ *
To use, define a {@code TroughContentDigestHistory} top-level bean in your * crawler-beans.cxml, then add {@link ContentDigestHistoryLoader} and * {@link ContentDigestHistoryStorer} to your fetch chain, sandwiching the - * {@link WARCWriterProcessor}. In other words, follow the directions at + * {@link WARCWriterChainProcessor}. In other words, follow the directions at * https://github.com/internetarchive/heritrix3/wiki/Duplication%20Reduction%20Processors * but replace the {@link BdbContentDigestHistory} bean with a * {@code TroughContentDigestHistory} bean. From 9e67a8dab4ba509ef5d51f5516ed6cab2323c755 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 12 Jun 2019 14:58:11 -0700 Subject: [PATCH 010/123] revisits only for http and ftp fixes NPE trying to write a dns revisit record --- .../java/org/archive/modules/warc/RevisitRecordBuilder.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java index 525fd8e94..bbed451a2 100644 --- a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java @@ -22,7 +22,9 @@ public class RevisitRecordBuilder extends WARCRecordBuilder { @Override public boolean shouldProcess(CrawlURI curi) { - return curi.isRevisit(); + String scheme = curi.getUURI().getScheme().toLowerCase(); + return curi.isRevisit() + && (scheme.startsWith("http") || scheme.equals("ftp")); } @Override From 9ddd281ef28301959573bf2df4e37537f5868998 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 12 Jun 2019 15:04:39 -0700 Subject: [PATCH 011/123] make WARCRecordBuilder an interface this way other classes that extend other classes can also implement WARCRecordBuilder --- .../modules/warc/BaseWARCRecordBuilder.java | 66 +++++++++++++++++++ .../warc/DnsResponseRecordBuilder.java | 2 +- .../FtpControlConversationRecordBuilder.java | 2 +- .../warc/FtpResponseRecordBuilder.java | 2 +- .../warc/HttpRequestRecordBuilder.java | 2 +- .../warc/HttpResponseRecordBuilder.java | 2 +- .../modules/warc/MetadataRecordBuilder.java | 2 +- .../modules/warc/RevisitRecordBuilder.java | 2 +- .../modules/warc/WARCRecordBuilder.java | 64 ++---------------- .../warc/WhoisResponseRecordBuilder.java | 2 +- .../writer/WARCWriterChainProcessor.java | 4 +- .../modules/writer/WriterPoolProcessor.java | 4 +- 12 files changed, 82 insertions(+), 72 deletions(-) create mode 100644 modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java diff --git a/modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java new file mode 100644 index 000000000..40eba8436 --- /dev/null +++ b/modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java @@ -0,0 +1,66 @@ +package org.archive.modules.warc; + +import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL; + +import java.net.InetAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.UUID; + +import org.archive.modules.CrawlURI; +import org.archive.modules.net.CrawlHost; +import org.archive.modules.net.ServerCache; +import org.springframework.beans.factory.annotation.Autowired; + +public abstract class BaseWARCRecordBuilder implements WARCRecordBuilder { + + transient protected ServerCache serverCache; + public ServerCache getServerCache() { + return 
this.serverCache; + } + @Autowired + public void setServerCache(ServerCache serverCache) { + this.serverCache = serverCache; + } + + public URI generateRecordID() { + try { + return new URI("urn:uuid:" + UUID.randomUUID()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); // impossible + } + } + + /** + * Return IP address of given URI suitable for recording (as in a + * classic ARC 5-field header line). + * + * @param curi CrawlURI + * @return String of IP address + */ + protected String getHostAddress(CrawlURI curi) { + // special handling for DNS URIs: want address of DNS server + if (curi.getUURI().getScheme().toLowerCase().equals("dns")) { + return (String)curi.getData().get(A_DNS_SERVER_IP_LABEL); + } + // otherwise, host referenced in URI + // TODO:FIXME: have fetcher insert exact IP contacted into curi, + // use that rather than inferred by CrawlHost lookup + CrawlHost h = getServerCache().getHostFor(curi.getUURI()); + if (h == null) { + throw new NullPointerException("Crawlhost is null for " + + curi + " " + curi.getVia()); + } + InetAddress a = h.getIP(); + if (a == null) { + throw new NullPointerException("Address is null for " + + curi + " " + curi.getVia() + ". Address " + + ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP)? + "was never looked up.": + (System.currentTimeMillis() - h.getIpFetched()) + + " ms ago.")); + } + return h.getIP().getHostAddress(); + } + +} diff --git a/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java index 2751b3a04..c83942153 100644 --- a/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java @@ -12,7 +12,7 @@ import org.archive.modules.CrawlURI; import org.archive.util.ArchiveUtils; -public class DnsResponseRecordBuilder extends WARCRecordBuilder { +public class DnsResponseRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldProcess(CrawlURI curi) { diff --git a/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java index 3d85c2eb0..e5481080e 100644 --- a/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java @@ -14,7 +14,7 @@ import org.archive.util.ArchiveUtils; import org.archive.util.anvl.ANVLRecord; -public class FtpControlConversationRecordBuilder extends WARCRecordBuilder { +public class FtpControlConversationRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldProcess(CrawlURI curi) { diff --git a/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java index 60366ec06..f3068569f 100644 --- a/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java @@ -13,7 +13,7 @@ import org.archive.modules.CrawlURI; import org.archive.util.ArchiveUtils; -public class FtpResponseRecordBuilder extends WARCRecordBuilder { +public class FtpResponseRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldProcess(CrawlURI curi) { diff --git a/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java 
b/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java index b71567aab..f2f55e549 100644 --- a/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java @@ -12,7 +12,7 @@ import org.archive.modules.CrawlURI; import org.archive.util.ArchiveUtils; -public class HttpRequestRecordBuilder extends WARCRecordBuilder { +public class HttpRequestRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldProcess(CrawlURI curi) { diff --git a/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java index b6f69e885..06a1b7324 100644 --- a/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java @@ -22,7 +22,7 @@ import org.archive.modules.CrawlURI; import org.archive.util.ArchiveUtils; -public class HttpResponseRecordBuilder extends WARCRecordBuilder { +public class HttpResponseRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldProcess(CrawlURI curi) { diff --git a/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java index 51a1547ad..9158810f4 100644 --- a/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java @@ -16,7 +16,7 @@ import org.archive.util.ArchiveUtils; import org.archive.util.anvl.ANVLRecord; -public class MetadataRecordBuilder extends WARCRecordBuilder { +public class MetadataRecordBuilder extends BaseWARCRecordBuilder { /** * If you don't want metadata records, take this class out of the chain. 
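As a rough sketch of what this extension point allows (a hypothetical builder, not part of the patch; it assumes the shouldProcess/buildRecord signatures shown in this patch, which a later patch in this series renames to shouldBuildRecord, and the example class name and record contents are illustrative only), a custom builder only has to decide whether it applies to a CrawlURI and then fill in a WARCRecordInfo, reusing the helpers now living in BaseWARCRecordBuilder:

package org.archive.modules.warc;

import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;

import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;

// Hypothetical example: emits a small metadata record for every http(s) capture.
public class ExampleNoteRecordBuilder extends BaseWARCRecordBuilder {

    @Override
    public boolean shouldProcess(CrawlURI curi) {
        // only applies to http/https captures
        return curi.getUURI().getScheme().toLowerCase().startsWith("http");
    }

    @Override
    public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo)
            throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.metadata);
        recordInfo.setRecordId(generateRecordID());
        if (concurrentTo != null) {
            recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO,
                    "<" + concurrentTo + ">");
        }
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(
                ArchiveUtils.getLog14Date(curi.getFetchBeginTime()));
        recordInfo.setMimetype("text/plain");
        recordInfo.setEnforceLength(true);

        // illustrative record body
        byte[] body = ("via: " + curi.getVia()).getBytes("UTF-8");
        recordInfo.setContentStream(new ByteArrayInputStream(body));
        recordInfo.setContentLength((long) body.length);
        return recordInfo;
    }
}

Wiring such a builder in would then presumably just be a matter of referencing it from the warcWriter chain in crawler-beans.cxml, alongside the default builders that WARCWriterChainProcessor now sets up in code.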
diff --git a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java index bbed451a2..aaaaac55a 100644 --- a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java @@ -18,7 +18,7 @@ import org.archive.modules.revisit.RevisitProfile; import org.archive.util.ArchiveUtils; -public class RevisitRecordBuilder extends WARCRecordBuilder { +public class RevisitRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldProcess(CrawlURI curi) { diff --git a/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java index a9ec884b1..ecd02db4c 100644 --- a/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java @@ -1,72 +1,16 @@ package org.archive.modules.warc; -import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL; - import java.io.IOException; -import java.net.InetAddress; import java.net.URI; -import java.net.URISyntaxException; -import java.util.UUID; import org.archive.io.warc.WARCRecordInfo; import org.archive.modules.CrawlURI; -import org.archive.modules.net.CrawlHost; -import org.archive.modules.net.ServerCache; -import org.springframework.beans.factory.annotation.Autowired; -public abstract class WARCRecordBuilder { +public interface WARCRecordBuilder { - transient protected ServerCache serverCache; - public ServerCache getServerCache() { - return this.serverCache; - } - @Autowired - public void setServerCache(ServerCache serverCache) { - this.serverCache = serverCache; - } + boolean shouldProcess(CrawlURI curi); - public abstract boolean shouldProcess(CrawlURI curi); - public abstract WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) + WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException; - - public URI generateRecordID() { - try { - return new URI("urn:uuid:" + UUID.randomUUID()); - } catch (URISyntaxException e) { - throw new RuntimeException(e); // impossible - } - } - - /** - * Return IP address of given URI suitable for recording (as in a - * classic ARC 5-field header line). - * - * @param curi CrawlURI - * @return String of IP address - */ - protected String getHostAddress(CrawlURI curi) { - // special handling for DNS URIs: want address of DNS server - if (curi.getUURI().getScheme().toLowerCase().equals("dns")) { - return (String)curi.getData().get(A_DNS_SERVER_IP_LABEL); - } - // otherwise, host referenced in URI - // TODO:FIXME: have fetcher insert exact IP contacted into curi, - // use that rather than inferred by CrawlHost lookup - CrawlHost h = getServerCache().getHostFor(curi.getUURI()); - if (h == null) { - throw new NullPointerException("Crawlhost is null for " + - curi + " " + curi.getVia()); - } - InetAddress a = h.getIP(); - if (a == null) { - throw new NullPointerException("Address is null for " + - curi + " " + curi.getVia() + ". Address " + - ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP)? 
- "was never looked up.": - (System.currentTimeMillis() - h.getIpFetched()) + - " ms ago.")); - } - return h.getIP().getHostAddress(); - } -} +} \ No newline at end of file diff --git a/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java index cefc1da80..8ddbb9d14 100644 --- a/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java @@ -12,7 +12,7 @@ import org.archive.modules.CrawlURI; import org.archive.util.ArchiveUtils; -public class WhoisResponseRecordBuilder extends WARCRecordBuilder { +public class WhoisResponseRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldProcess(CrawlURI curi) { diff --git a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java index 84e1b6f36..6c04c06b2 100644 --- a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java +++ b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java @@ -39,10 +39,10 @@ public class WARCWriterChainProcessor extends BaseWARCWriterProcessor implements new MetadataRecordBuilder())); } @SuppressWarnings("unchecked") - public List getChain() { + public List getChain() { return (List) kp.get("chain"); } - public void setChain(List chain) { + public void setChain(List chain) { kp.put("chain", chain); } diff --git a/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java b/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java index 12a9cabd0..a5a030e81 100644 --- a/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java +++ b/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java @@ -45,7 +45,7 @@ import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule; import org.archive.modules.net.CrawlHost; import org.archive.modules.net.ServerCache; -import org.archive.modules.warc.WARCRecordBuilder; +import org.archive.modules.warc.BaseWARCRecordBuilder; import org.archive.spring.ConfigPath; import org.archive.util.FileUtils; import org.json.JSONException; @@ -372,7 +372,7 @@ protected boolean shouldWrite(CrawlURI curi) { * @param curi CrawlURI * @return String of IP address * - * @deprecated WARCRecordBuilder instances use {@link WARCRecordBuilder#getHostAddress(CrawlURI)} + * @deprecated WARCRecordBuilder instances use {@link BaseWARCRecordBuilder#getHostAddress(CrawlURI)} */ @Deprecated protected String getHostAddress(CrawlURI curi) { From 5177b2b6dafcedb5ddfc9793ba9cfafa01f6e34f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 12 Jun 2019 15:17:33 -0700 Subject: [PATCH 012/123] rename method so as not to conflict with Processor --- .../java/org/archive/modules/warc/DnsResponseRecordBuilder.java | 2 +- .../modules/warc/FtpControlConversationRecordBuilder.java | 2 +- .../java/org/archive/modules/warc/FtpResponseRecordBuilder.java | 2 +- .../java/org/archive/modules/warc/HttpRequestRecordBuilder.java | 2 +- .../org/archive/modules/warc/HttpResponseRecordBuilder.java | 2 +- .../java/org/archive/modules/warc/MetadataRecordBuilder.java | 2 +- .../java/org/archive/modules/warc/RevisitRecordBuilder.java | 2 +- .../main/java/org/archive/modules/warc/WARCRecordBuilder.java | 2 +- .../org/archive/modules/warc/WhoisResponseRecordBuilder.java | 2 +- 
.../org/archive/modules/writer/WARCWriterChainProcessor.java | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java index c83942153..f982a6aaf 100644 --- a/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java @@ -15,7 +15,7 @@ public class DnsResponseRecordBuilder extends BaseWARCRecordBuilder { @Override - public boolean shouldProcess(CrawlURI curi) { + public boolean shouldBuildRecord(CrawlURI curi) { return "dns".equals(curi.getUURI().getScheme().toLowerCase()); } diff --git a/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java index e5481080e..de8c978ae 100644 --- a/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java @@ -17,7 +17,7 @@ public class FtpControlConversationRecordBuilder extends BaseWARCRecordBuilder { @Override - public boolean shouldProcess(CrawlURI curi) { + public boolean shouldBuildRecord(CrawlURI curi) { return "ftp".equals(curi.getUURI().getScheme().toLowerCase()); } diff --git a/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java index f3068569f..ddd9444be 100644 --- a/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java @@ -16,7 +16,7 @@ public class FtpResponseRecordBuilder extends BaseWARCRecordBuilder { @Override - public boolean shouldProcess(CrawlURI curi) { + public boolean shouldBuildRecord(CrawlURI curi) { return !curi.isRevisit() && "ftp".equals(curi.getUURI().getScheme().toLowerCase()); } diff --git a/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java index f2f55e549..cd51e21de 100644 --- a/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java @@ -15,7 +15,7 @@ public class HttpRequestRecordBuilder extends BaseWARCRecordBuilder { @Override - public boolean shouldProcess(CrawlURI curi) { + public boolean shouldBuildRecord(CrawlURI curi) { return curi.getUURI().getScheme().toLowerCase().startsWith("http"); } diff --git a/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java index 06a1b7324..4b54ea26e 100644 --- a/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java @@ -25,7 +25,7 @@ public class HttpResponseRecordBuilder extends BaseWARCRecordBuilder { @Override - public boolean shouldProcess(CrawlURI curi) { + public boolean shouldBuildRecord(CrawlURI curi) { return !curi.isRevisit() && curi.getUURI().getScheme().toLowerCase().startsWith("http"); } diff --git a/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java 
index 9158810f4..c167740f4 100644 --- a/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java @@ -22,7 +22,7 @@ public class MetadataRecordBuilder extends BaseWARCRecordBuilder { * If you don't want metadata records, take this class out of the chain. */ @Override - public boolean shouldProcess(CrawlURI curi) { + public boolean shouldBuildRecord(CrawlURI curi) { String scheme = curi.getUURI().getScheme().toLowerCase(); return scheme.startsWith("http") || "ftp".equals(scheme); } diff --git a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java index aaaaac55a..6234dd5f8 100644 --- a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java @@ -21,7 +21,7 @@ public class RevisitRecordBuilder extends BaseWARCRecordBuilder { @Override - public boolean shouldProcess(CrawlURI curi) { + public boolean shouldBuildRecord(CrawlURI curi) { String scheme = curi.getUURI().getScheme().toLowerCase(); return curi.isRevisit() && (scheme.startsWith("http") || scheme.equals("ftp")); diff --git a/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java index ecd02db4c..afd19bbe9 100644 --- a/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java @@ -8,7 +8,7 @@ public interface WARCRecordBuilder { - boolean shouldProcess(CrawlURI curi); + boolean shouldBuildRecord(CrawlURI curi); WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException; diff --git a/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java index 8ddbb9d14..3726f500b 100644 --- a/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java @@ -15,7 +15,7 @@ public class WhoisResponseRecordBuilder extends BaseWARCRecordBuilder { @Override - public boolean shouldProcess(CrawlURI curi) { + public boolean shouldBuildRecord(CrawlURI curi) { return "whois".equals(curi.getUURI().getScheme().toLowerCase()); } diff --git a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java index 6c04c06b2..895f72608 100644 --- a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java +++ b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java @@ -128,7 +128,7 @@ protected ProcessResult write(final CrawlURI curi) protected void writeRecords(CrawlURI curi, WARCWriter writer) throws IOException { URI concurrentTo = null; for (WARCRecordBuilder recordBuilder: getChain()) { - if (recordBuilder.shouldProcess(curi)) { + if (recordBuilder.shouldBuildRecord(curi)) { WARCRecordInfo record = recordBuilder.buildRecord(curi, concurrentTo); if (record != null) { writer.writeRecord(record); From bff33e00e6716efe95e5f29b01b167e6470e869a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 12 Jun 2019 17:40:57 -0700 Subject: [PATCH 013/123] write youtube-dl json to the warc ExtractorYoutubeDL implements WARCRecordBuilder --- 
.../modules/extractor/ExtractorYoutubeDL.java | 72 ++++++++++++++++++- .../modules/warc/BaseWARCRecordBuilder.java | 2 +- 2 files changed, 71 insertions(+), 3 deletions(-) diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index d250fd05f..723d914c3 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -19,9 +19,14 @@ package org.archive.modules.extractor; +import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; + +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; +import java.io.StringWriter; +import java.net.URI; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -32,8 +37,12 @@ import org.apache.commons.httpclient.URIException; import org.archive.crawler.reporting.CrawlerLoggerModule; +import org.archive.format.warc.WARCConstants.WARCRecordType; +import org.archive.io.warc.WARCRecordInfo; import org.archive.modules.CoreAttributeConstants; import org.archive.modules.CrawlURI; +import org.archive.modules.warc.BaseWARCRecordBuilder; +import org.archive.modules.warc.WARCRecordBuilder; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.util.ArchiveUtils; @@ -46,16 +55,34 @@ import com.google.gson.JsonObject; import com.google.gson.JsonParseException; import com.google.gson.JsonStreamParser; +import com.google.gson.internal.Streams; +import com.google.gson.stream.JsonWriter; /** * Extracts links to media by running youtube-dl in a subprocess. Runs only on * html. * *
 * <p>
+ * Also implements {@link WARCRecordBuilder} to write youtube-dl json to the
+ * warc.
+ *
+ * <p>
+ * To use ExtractorYoutubeDL, add this top-level bean:
+ *
+ * <pre>
+ * <bean id="extractorYoutubeDL" class="org.archive.modules.extractor.ExtractorYoutubeDL"/>
+ * </pre>
+ *
+ * Then add <ref bean="extractorYoutubeDL"/> to end of the
+ * fetch chain, and to the end of the warc writer chain.
+ *
+ * <p>
 * Keeps a log of containing pages and media captured as a result of youtube-dl
 * extraction. The format of the log is as follows:
 *
- * <pre> [timestamp] [media-http-status] [media-length] [media-mimetype] [media-digest] [media-timestamp] [media-url] [annotation] [containing-page-digest] [containing-page-timestamp] [containing-page-url] [seed-url]</pre>
+ *
+ * <pre>
+ * [timestamp] [media-http-status] [media-length] [media-mimetype] [media-digest] [media-timestamp] [media-url] [annotation] [containing-page-digest] [containing-page-timestamp] [containing-page-url] [seed-url]
+ * </pre>
 *
 * <p>
* For containing pages, all of the {@code media-*} fields have the value @@ -71,7 +98,8 @@ * * @author nlevitt */ -public class ExtractorYoutubeDL extends Extractor implements Lifecycle { +public class ExtractorYoutubeDL extends Extractor + implements Lifecycle, WARCRecordBuilder { private static Logger logger = Logger.getLogger(ExtractorYoutubeDL.class.getName()); @@ -149,6 +177,10 @@ protected void extract(CrawlURI uri) { addVideoOutlink(uri, json, videoUrl); } } + + // XXX this can be large, consider using a RecordingOutputStream + uri.getData().put("ydlJson", ydlJson); + String annotation = "youtube-dl:" + jsonEntries.size(); uri.getAnnotations().add(annotation); logContainingPage(uri, annotation); @@ -400,4 +432,40 @@ protected boolean shouldExtract(CrawlURI uri) { return false; } + + @Override + public boolean shouldBuildRecord(CrawlURI curi) { + return curi.containsDataKey("ydlJson"); + } + + @Override + public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) + throws IOException { + final String timestamp = + ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); + + WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setType(WARCRecordType.metadata); + recordInfo.setRecordId(BaseWARCRecordBuilder.generateRecordID()); + if (concurrentTo != null) { + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + "<" + concurrentTo + ">"); + } + recordInfo.setUrl("youtube-dl:" + curi); + recordInfo.setCreate14DigitDate(timestamp); + recordInfo.setMimetype("application/vnd.youtube-dl_formats+json;charset=utf-8"); + recordInfo.setEnforceLength(true); + + JsonObject ydlJson = (JsonObject) curi.getData().get("ydlJson"); + StringWriter stringWriter = new StringWriter(); + JsonWriter jsonWriter = new JsonWriter(stringWriter); + jsonWriter.setIndent(" "); + Streams.write(ydlJson, jsonWriter); + + byte[] b = stringWriter.toString().getBytes("UTF-8"); + recordInfo.setContentStream(new ByteArrayInputStream(b)); + recordInfo.setContentLength((long) b.length); + + return recordInfo; + } } diff --git a/modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java index 40eba8436..20dbf5569 100644 --- a/modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/BaseWARCRecordBuilder.java @@ -23,7 +23,7 @@ public void setServerCache(ServerCache serverCache) { this.serverCache = serverCache; } - public URI generateRecordID() { + public static URI generateRecordID() { try { return new URI("urn:uuid:" + UUID.randomUUID()); } catch (URISyntaxException e) { From 74a0222e5686a2e28cfe517038e211fc62cc0dc0 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Fri, 2 Aug 2019 15:47:02 +0900 Subject: [PATCH 014/123] Travis fixes - use trusty dist image for oraclejdk8 and openjdk7 as they're [unavailable] in the new xenial image - jdk7: skip building contrib rather than allowing failures - contrib is now in the parent pom so remove its separate build command - start testing against openjdk11 but allow failures for now [unavailable]: https://travis-ci.community/t/install-of-oracle-jdk-8-failing/3038/9 --- .travis.yml | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 43c4eec65..20978aa4d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16 +1,19 @@ + sudo: false language: java - -jdk: - - oraclejdk8 - - openjdk7 - - openjdk8 - matrix: - allow_failures: + include: + - jdk: 
oraclejdk8 + dist: trusty - jdk: openjdk7 - + dist: trusty + env: PROJECTS='--projects commons,modules,engine' + - jdk: openjdk8 + - jdk: openjdk11 + allow_failures: + - jdk: openjdk11 + before_install: - "export JAVA_OPTS=-Xmx1500m" - "echo JAVA_OPTS=$JAVA_OPTS" @@ -18,14 +21,15 @@ before_install: - "echo MAVEN_OPTS=$MAVEN_OPTS" - "export _JAVA_OPTIONS=-Xmx1500m" - "echo _JAVA_OPTIONS=$_JAVA_OPTIONS" + +install: mvn dependency:resolve -B -V $PROJECTS cache: directories: - $HOME/.m2 script: - - travis_wait 30 mvn install - - cd contrib && mvn install + - travis_wait 30 mvn install $PROJECTS after_failure: - cat */target/surefire-reports/*.txt From 412885fd7d03fbb14b04d3e58856c4b27cc04853 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Fri, 2 Aug 2019 16:43:20 +0900 Subject: [PATCH 015/123] JDK11 support: upgrade maven-surefire-plugin to 2.22.2 Under JDK11 our old version of it throws ClassNotFoundExceptions when tests load some builtin classes like javax.transaction.xa.Xid. Fixes #266 --- commons/pom.xml | 2 +- dist/pom.xml | 2 +- engine/pom.xml | 2 +- modules/pom.xml | 2 +- pom.xml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/commons/pom.xml b/commons/pom.xml index 3a9da7dcc..3a121b022 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -221,7 +221,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.9 + 2.22.2 + + javax.xml.bind + jaxb-api + 2.3.1 + + + org.glassfish.jaxb + jaxb-runtime + 2.3.1 + runtime + org.littleshoot littleproxy From bf819e19a5ca4adcf4c2c1e3b4b8a56b2bba87e9 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Fri, 2 Aug 2019 13:44:49 +0900 Subject: [PATCH 017/123] JDK11 support: exclude tools.jar from hbase-client dependency tools.jar is no longer included in the JDK as of JDK 11 (per JEP 220) so exclude it as a dependency. According to [HBASE-13963] it is only required when compiling hbase itself and was unintentionally leaked as a transitive dependency. Fixes #265 [HBASE-13963]: https://issues.apache.org/jira/browse/HBASE-13963 --- contrib/pom.xml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/contrib/pom.xml b/contrib/pom.xml index 5a1e1ba62..f6afde535 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -26,6 +26,13 @@ junit junit + + + jdk.tools + jdk.tools + From a5c2b4620876457f1e54235ff787064a875c7493 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Fri, 2 Aug 2019 17:10:13 +0900 Subject: [PATCH 018/123] JDK11 support: remove unused class ObjectIdentityBdbCache and tests In 2011 ObjectIdentityBdbCache was replaced by ObjectIdentityBdbManualCache in order to "remove dependence on GC/finalization/PhantomReference magic". It hasn't been used since. As of JDK11 the ObjectIdentityBdbCache unit tests are failing so let's remove it. --- .../archive/util/ObjectIdentityBdbCache.java | 564 ------------------ .../util/ObjectIdentityBdbCacheTest.java | 199 ------ 2 files changed, 763 deletions(-) delete mode 100644 commons/src/main/java/org/archive/util/ObjectIdentityBdbCache.java delete mode 100644 commons/src/test/java/org/archive/util/ObjectIdentityBdbCacheTest.java diff --git a/commons/src/main/java/org/archive/util/ObjectIdentityBdbCache.java b/commons/src/main/java/org/archive/util/ObjectIdentityBdbCache.java deleted file mode 100644 index fd707c53c..000000000 --- a/commons/src/main/java/org/archive/util/ObjectIdentityBdbCache.java +++ /dev/null @@ -1,564 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). 
- * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.archive.util; - -import java.io.Closeable; -import java.io.Serializable; -import java.lang.ref.PhantomReference; -import java.lang.ref.Reference; -import java.lang.ref.ReferenceQueue; -import java.lang.ref.SoftReference; -import java.lang.reflect.Field; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicLong; -import java.util.logging.Level; -import java.util.logging.Logger; - -import org.archive.bdb.KryoBinding; - -import com.sleepycat.bind.EntryBinding; -import com.sleepycat.bind.serial.StoredClassCatalog; -import com.sleepycat.bind.tuple.TupleBinding; -import com.sleepycat.collections.StoredSortedMap; -import com.sleepycat.je.Database; -import com.sleepycat.je.DatabaseConfig; -import com.sleepycat.je.DatabaseException; -import com.sleepycat.je.Environment; - -/** - * A BDB JE backed object cache. - * - * Soft references to previously-instantiated objects are held so that - * unless/until an object is garbage collected, subsequent get()s will - * return the exact same object. (If all outside references are lost, - * when the soft reference is broken, the object state -- still - * accessible to this class via reflective access to a phantom - * referent --is flushed to disk. The next get() will reconsitute a new - * object, from the disk state.) - *

- * The backing disk is only guaranteed to be up-to-date after a flush - * of all in-memory values to disk, as can be forced by sync(). - *

- * To ensure that changes/mutations to values in this map are coherent and - * consistent at the application level, it is assumed that the application - * level only mutates values that are in this map and does not retain references - * to values longer than necessary. This allows mappings to be persisted - * during GC without explicit transactions or write operations. - *

- * Based on the earlier CachedBdbMap. - *

- * - * @author John Erik Halse - * @author stack - * @author gojomo - * @author paul baclace (conversion to ConcurrentMap) - * - */ -public class ObjectIdentityBdbCache -implements ObjectIdentityCache, Closeable, Serializable { - private static final long serialVersionUID = 1L; - private static final Logger logger = - Logger.getLogger(ObjectIdentityBdbCache.class.getName()); - - /** The BDB JE database used for this instance. */ - protected transient Database db; - - /** in-memory map of new/recent/still-referenced-elsewhere instances */ - protected transient ConcurrentHashMap> memMap; - protected transient ReferenceQueue refQueue; - - /** The Collection view of the BDB JE database used for this instance. */ - protected transient StoredSortedMap diskMap; - - protected AtomicLong count; - - // - // USAGE STATS - // - /** Count of times we got an object from in-memory cache */ - private AtomicLong cacheHit = new AtomicLong(0); - /** Count of times the {@link ObjectIdentityBdbCache#get} method was called. */ - private AtomicLong countOfGets = new AtomicLong(0); - /** Count of every time disk-based map provided non-null object */ - private AtomicLong diskHit = new AtomicLong(0); - /** Count of times Supplier was used for new object */ - private AtomicLong supplierUsed = new AtomicLong(0); - /** count of expunge put() to BDB (implies disk) */ - private AtomicLong expungeStatsDiskPut = new AtomicLong(0); - /** count of {@link #sync()} use */ - transient private AtomicLong useStatsSyncUsed = new AtomicLong(0); - - /** Reference to the Reference#referent Field. */ - protected static Field referentField; - static { - // We need access to the referent field in the PhantomReference. - // For more on this trick, see - // - // http://www.javaspecialists.co.za/archive/Issue098.html and for - // discussion: - // http://www.theserverside.com/tss?service=direct/0/NewsThread/threadViewer.markNoisy.link&sp=l29865&sp=l146901 - try { - referentField = Reference.class.getDeclaredField("referent"); - referentField.setAccessible(true); - } catch (SecurityException e) { - throw new RuntimeException(e); - } catch (NoSuchFieldException e) { - throw new RuntimeException(e); - } - } - - /** - * Constructor. You must call - * {@link #initialize(Environment, String, Class, StoredClassCatalog)} - * to finish construction. Construction is two-stepped to support - * reconnecting a deserialized CachedBdbMap with its backing bdbje - * database. - */ - public ObjectIdentityBdbCache() { - super(); - } - - /** - * Call this method when you have an instance when you used the - * default constructor or when you have a deserialized instance that you - * want to reconnect with an extant bdbje environment. - * @param env - * @param dbName - * @param valueClass - * @param classCatalog - * @throws DatabaseException - */ - public void initialize(final Environment env, String dbName, - final Class valueClass, final StoredClassCatalog classCatalog) - throws DatabaseException { - // TODO: initial capacity should be related to number of seeds, max depth, max docs - this.memMap = new ConcurrentHashMap>( - 8192, // initial capacity - 0.9f, // acceptable load factor - 64 // est. 
number of concurrent threads - ); - this.refQueue = new ReferenceQueue(); - canary = new SoftReference(new LowMemoryCanary()); - - this.db = openDatabase(env, dbName); - this.diskMap = createDiskMap(this.db, classCatalog, valueClass); - this.count = new AtomicLong(diskMap.size()); - } - - @SuppressWarnings("unchecked") - protected StoredSortedMap createDiskMap(Database database, - StoredClassCatalog classCatalog, Class valueClass) { - EntryBinding keyBinding = TupleBinding.getPrimitiveBinding(String.class); - EntryBinding valueBinding = TupleBinding.getPrimitiveBinding(valueClass); - if(valueBinding == null) { - valueBinding = - new KryoBinding(valueClass); -// new SerialBinding(classCatalog, valueClass); -// new BenchmarkingBinding(new EntryBinding[] { -// new KryoBinding(valueClass), -// new RecyclingSerialBinding(classCatalog, valueClass), -// }, valueClass); - } - return new StoredSortedMap(database, keyBinding, valueBinding, true); - } - - protected Database openDatabase(final Environment environment, - final String dbName) throws DatabaseException { - DatabaseConfig dbConfig = new DatabaseConfig(); - dbConfig.setTransactional(false); - dbConfig.setAllowCreate(true); - dbConfig.setDeferredWrite(true); - return environment.openDatabase(null, dbName, dbConfig); - } - - /* (non-Javadoc) - * @see org.archive.util.ObjectIdentityCache#close() - */ - public synchronized void close() { - // Close out my bdb db. - if (this.db != null) { - try { - sync(); - this.db.sync(); - this.db.close(); - } catch (DatabaseException e) { - logger.log(Level.WARNING,"problem closing ObjectIdentityBdbCache",e); - } finally { - this.db = null; - } - } - } - - protected void finalize() throws Throwable { - close(); - super.finalize(); - } - - /* (non-Javadoc) - * @see org.archive.util.ObjectIdentityCache#get(java.lang.String) - */ - public V get(final String key) { - return getOrUse(key,null); - } - - /* (non-Javadoc) - * @see org.archive.util.ObjectIdentityCache#get(java.lang.String, org.archive.util.ObjectIdentityBdbCache) - */ - public V getOrUse(final String key, Supplier supplierOrNull) { - countOfGets.incrementAndGet(); - - if (countOfGets.get() % 10000 == 0) { - logCacheSummary(); - } - - // check mem cache - SoftEntry entry = memMap.get(key); - if(entry != null) { - V val = entry.get(); - if(val != null) { - // the concurrent garden path: in mem, valid - cacheHit.incrementAndGet(); - val.setIdentityCache(this); - return val; - } - } - - // everything in other difficult cases happens inside this block - synchronized(this) { - // recheck mem cache -- if another thread beat us into sync - // block and already filled the key - entry = memMap.get(key); - if(entry != null) { - V val = entry.get(); - if(val != null) { - cacheHit.incrementAndGet(); - val.setIdentityCache(this); - return val; - } - } - // persist to disk all ref-enqueued stale (soft-ref-cleared) entries now - pageOutStaleEntries(); - // and catch if this exact entry not yet ref-enqueued - if(memMap.get(key)!=null) { - pageOutStaleEntry(entry); - if(memMap.get(key)!=null) { - logger.log(Level.SEVERE,"nulled key "+key+" not paged-out", new Exception()); - } - } - - // check disk - V valDisk = (V) diskMap.get(key); - if(valDisk==null) { - // never yet created, consider creating - if(supplierOrNull==null) { - return null; - } - // create using provided Supplier - valDisk = supplierOrNull.get(); - supplierUsed.incrementAndGet(); - // putting initial value directly into diskMap - // (rather than just the memMap until page-out) - // ensures 
diskMap.keySet() provides complete view - V prevVal = diskMap.putIfAbsent(key, valDisk); - count.incrementAndGet(); - if(prevVal!=null) { - // ERROR: diskMap modification since previous - // diskMap.get() should be impossible - logger.log(Level.SEVERE,"diskMap modified outside synchronized block?"); - } - } else { - diskHit.incrementAndGet(); - } - - // keep new val in memMap - SoftEntry newEntry = new SoftEntry(key, valDisk, refQueue); - SoftEntry prevVal = memMap.putIfAbsent(key, newEntry); - if(prevVal != null) { - // ERROR: memMap modification since previous - // memMap.get() should be impossible - logger.log(Level.SEVERE,"memMap modified outside synchronized block?", new Exception()); - } - valDisk.setIdentityCache(this); - return valDisk; - } - } - - /* (non-Javadoc) - * @see org.archive.util.ObjectIdentityCache#keySet() - */ - public Set keySet() { - return diskMap.keySet(); - } - - /** - * Summary to log, if at FINE level - */ - private void logCacheSummary() { - if (logger.isLoggable((Level.FINE))) { - logger.fine(composeCacheSummary()); - } - } - - protected String composeCacheSummary() { - long totalHits = cacheHit.get() + diskHit.get(); - if (totalHits < 1) { - return ""; - } - long cacheHitPercent - = (cacheHit.get() * 100) / totalHits; - StringBuilder sb = new StringBuilder(120); - sb.append("DB name:") - .append(getDatabaseName()) - .append(", ") - .append(" hit%: ") - .append(cacheHitPercent) - .append("%, gets=") - .append(countOfGets.get()) - .append(" memHits=") - .append(cacheHit.get()) - .append(" diskHits=") - .append(diskHit.get()) - .append(" supplieds=") - .append(supplierUsed.get()) - .append(" expungePuts=") - .append(expungeStatsDiskPut.get()) - .append(" syncs=") - .append(useStatsSyncUsed.get()); - return sb.toString(); - } - - /* (non-Javadoc) - * @see org.archive.util.ObjectIdentityCache#size() - */ - public int size() { - if(db==null) { - return 0; - } - return (int) count.get(); - } - - protected String getDatabaseName() { - String name = "DbName-Lookup-Failed"; - try { - if (this.db != null) { - name = this.db.getDatabaseName(); - } - } catch (DatabaseException e) { - // Ignore. - } - return name; - } - - /** - * Sync all in-memory map entries to backing disk store. - */ - public synchronized void sync() { - String dbName = null; - // Sync. memory and disk. - useStatsSyncUsed.incrementAndGet(); - long startTime = 0; - if (logger.isLoggable(Level.FINE)) { - dbName = getDatabaseName(); - startTime = System.currentTimeMillis(); - logger.fine(dbName + " start sizes: disk " + this.diskMap.size() + - ", mem " + this.memMap.size()); - } - - for (String key : this.memMap.keySet()) { - SoftEntry entry = memMap.get(key); - if (entry != null) { - // Get & hold so not cleared pre-return. - V value = entry.get(); - if (value != null) { - expungeStatsDiskPut.incrementAndGet(); - this.diskMap.put(key, value); // unchecked cast - } - } - } - pageOutStaleEntries(); - - // force sync of deferred-writes - try { - this.db.sync(); - } catch (DatabaseException e) { - throw new RuntimeException(e); - } - - if (logger.isLoggable(Level.FINE)) { - logger.fine(dbName + " sync took " + - (System.currentTimeMillis() - startTime) + "ms. " + - "Finish sizes: disk " + - this.diskMap.size() + ", mem " + this.memMap.size()); - } - } - - @Override - public void dirtyKey(String key) { - // do nothing, because our weak/phantom trickery is supposed to - // ensure sync-to-persistence if/when dereferenced and collected - } - - /** An incremental, poll-based expunger. 
- * - * Package-protected for unit-test visibility. - */ - @SuppressWarnings("unchecked") - protected synchronized void pageOutStaleEntries() { - int c = 0; - long startTime = System.currentTimeMillis(); - for(SoftEntry entry; (entry = (SoftEntry)refQueue.poll()) != null;) { - pageOutStaleEntry(entry); - c++; - } - if (c > 0 && logger.isLoggable(Level.FINER)) { - long endTime = System.currentTimeMillis(); - try { - logger.finer("DB: " + db.getDatabaseName() + ", Expunged: " - + c + ", Diskmap size: " + diskMap.size() - + ", Cache size: " + memMap.size() - + ", in "+(endTime-startTime)+"ms"); - } catch (DatabaseException e) { - logger.log(Level.FINER,"exception while logging",e); - } - } - } - - /** - * Expunge an entry from memMap while updating diskMap. - * - * @param entry a SoftEntry obtained from refQueuePoll() - */ - synchronized private void pageOutStaleEntry(SoftEntry entry) { - PhantomEntry phantom = entry.phantom; - - // Still in memMap? if not, was paged-out by earlier direct access - // before placed into reference-queue; just return - if (memMap.get(phantom.key) != entry) { // NOTE: intentional identity compare - return; - } - - // recover hidden value - V phantomValue = phantom.doctoredGet(); - - // Expected value present? (should be; only clear is at end of - // this method, after entry removal from memMap) - if(phantomValue == null) { - logger.log(Level.WARNING,"unexpected null phantomValue", new Exception()); - return; // nothing to do - } - - // given instance entry still in memMap; - // we have the key and phantom Value, - // the diskMap can be updated. - diskMap.put(phantom.key, phantomValue); // unchecked cast - expungeStatsDiskPut.incrementAndGet(); - - // remove memMap entry - boolean removed = memMap.remove(phantom.key, entry); - if(!removed) { - logger.log(Level.WARNING,"expunge memMap.remove() ineffective",new Exception()); - } - phantom.clear(); // truly allows GC of unreferenced V object - } - - private static class PhantomEntry extends PhantomReference { - protected final String key; - - public PhantomEntry(String key, V referent) { - super(referent, null); - this.key = key; - } - - /** - * @return Return the referent. The contract for {@link #get()} - * always returns a null referent. We've cheated and doctored - * PhantomReference to return the actual referent value. See notes - * at {@link #referentField}; - */ - @SuppressWarnings("unchecked") - final public V doctoredGet() { - try { - // Here we use the referentField saved off on static - // initialization of this class to get at this References' - // private referent field. - return (V) referentField.get(this); - } catch (IllegalAccessException e) { - throw new RuntimeException(e); - } - } - } - - /** - * SoftReference cache entry. - * - * A PhantomReference is used to hold the key and value as a last - * chance before GC hook that can effect the update of diskMap. - *

- * Entries are not recycled. - */ - private static class SoftEntry extends SoftReference { - PhantomEntry phantom; - - public SoftEntry(String key, V referent, ReferenceQueue q) { - super(referent, q); - this.phantom = new PhantomEntry(key, referent); - } - - public V get() { - // ensure visibility - synchronized (this) { - return super.get(); - } - } - - public String toString() { - if (phantom != null) { - return "SoftEntry(key=" + phantom.key + ")"; - } else { - return "SoftEntry()"; - } - } - } - - // - // Crude, probably unreliable/fragile but harmless mechanism to - // trigger expunge of cleared SoftReferences in low-memory - // conditions even without any of the other get/put triggers. - // - - protected transient SoftReference canary; - protected class LowMemoryCanary { - /** When collected/finalized -- as should be expected in - * low-memory conditions -- trigger an expunge and a - * new 'canary' insertion. */ - public void finalize() { - ObjectIdentityBdbCache.this.pageOutStaleEntries(); -// System.err.println("CANARY KILLED - "+ObjectIdentityBdbCache.this); - // only install new canary if map still 'open' with db reference - if(ObjectIdentityBdbCache.this.db !=null) { - ObjectIdentityBdbCache.this.canary = - new SoftReference(new LowMemoryCanary()); - } else { - ObjectIdentityBdbCache.this.canary = null; - } - } - } -} diff --git a/commons/src/test/java/org/archive/util/ObjectIdentityBdbCacheTest.java b/commons/src/test/java/org/archive/util/ObjectIdentityBdbCacheTest.java deleted file mode 100644 index 1eb6e4943..000000000 --- a/commons/src/test/java/org/archive/util/ObjectIdentityBdbCacheTest.java +++ /dev/null @@ -1,199 +0,0 @@ -/* - * This file is part of the Heritrix web crawler (crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.util; - -import java.io.File; -import java.util.HashMap; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.math.RandomUtils; -import org.archive.util.bdbje.EnhancedEnvironment; - -/** - * @author stack - * @author gojomo - * @version $Date: 2009-08-03 23:50:43 -0700 (Mon, 03 Aug 2009) $, $Revision: 6434 $ - */ -public class ObjectIdentityBdbCacheTest extends TmpDirTestCase { - EnhancedEnvironment env; - private ObjectIdentityBdbCache>> cache; - - protected void setUp() throws Exception { - super.setUp(); - File envDir = new File(getTmpDir(),"ObjectIdentityBdbCacheTest"); - org.archive.util.FileUtils.ensureWriteableDirectory(envDir); - FileUtils.deleteDirectory(envDir); - org.archive.util.FileUtils.ensureWriteableDirectory(envDir); - env = EnhancedEnvironment.getTestEnvironment(envDir); - this.cache = new ObjectIdentityBdbCache>>(); - this.cache.initialize(env,"setUpCache",IdentityCacheableWrapper.class, env.getClassCatalog()); - } - - protected void tearDown() throws Exception { - this.cache.close(); - File envDir = env.getHome(); - env.close(); - FileUtils.deleteDirectory(envDir); - super.tearDown(); - } - - @SuppressWarnings("unchecked") - public void testReadConsistencyUnderLoad() throws Exception { - final ObjectIdentityBdbCache> cbdbmap = - new ObjectIdentityBdbCache(); - cbdbmap.initialize(env, - "consistencyCache", - IdentityCacheableWrapper.class, - env.getClassCatalog()); - try { - final AtomicInteger level = new AtomicInteger(0); - final int keyCount = 128 * 1024; // 128K keys - final int maxLevel = 64; - // initial fill - for(int i=0; i < keyCount; i++) { - final String key = ""+i; - cbdbmap.getOrUse( - key, - new Supplier>( - new IdentityCacheableWrapper( - key, new AtomicInteger(level.get())))); - } - // backward checking that all values always at level or higher - new Thread() { - public void run() { - untilmax: while(true) { - for(int j=keyCount-1; j >= 0; j--) { - int targetValue = level.get(); - if(targetValue>=maxLevel) { - break untilmax; - } - assertTrue("stale value revseq key "+j,cbdbmap.get(""+j).get().get()>=targetValue); - Thread.yield(); - } - } - } - };//.start(); - // random checking that all values always at level or higher - new Thread() { - public void run() { - untilmax: while(true) { - int j = RandomUtils.nextInt(keyCount); - int targetValue = level.get(); - if(targetValue>=maxLevel) { - break untilmax; - } - assertTrue("stale value random key "+j, - cbdbmap.get(""+j).get().get()>=targetValue); - Thread.yield(); - } - } - };//.start(); - // increment all keys - for(; level.get() < maxLevel; level.incrementAndGet()) { - for(int k = 0; k < keyCount; k++) { - int foundValue = cbdbmap.get(""+k).get().getAndIncrement(); - assertEquals("stale value preinc key "+k, level.get(), foundValue); - } - if(level.get() % 10 == 0) { - System.out.println("level to "+level.get()); - if(level.get()>0) { - TestUtils.forceScarceMemory(); - } - System.out.println("OIBCT:"+cbdbmap.composeCacheSummary()); - } - Thread.yield(); - } - } finally { - System.err.println("OIBCT:"+cbdbmap.composeCacheSummary()); - cbdbmap.close(); - } - // SUCCESS - } - - public void testBackingDbGetsUpdated() { - // Set up values. - final String value = "value"; - final String key = "key"; - final int upperbound = 3; - // First put in empty hashmap. 
- for (int i = 0; i < upperbound; i++) { - String innerKey = key + Integer.toString(i); - this.cache.getOrUse( - innerKey, - new Supplier>>( - new IdentityCacheableWrapper>( - innerKey, new HashMap()))); - } - // Now add value to hash map. - for (int i = 0; i < upperbound; i++) { - HashMap m = this.cache.get(key + Integer.toString(i)).get(); - m.put(key, value); - } - this.cache.sync(); - for (int i = 0; i < upperbound; i++) { - HashMap m = this.cache.get(key + Integer.toString(i)).get(); - String v = m.get(key); - assertNotNull("value should not be null",v); - assertEquals("value incorrect", value, v); - } - } - - /** - * Test that in scarce memory conditions, the memory map is - * expunged of otherwise unreferenced entries as expected. - * - * NOTE: this test may be especially fragile with regard to - * GC/timing issues; relies on timely finalization, which is - * never guaranteed by JVM/GC. For example, it is so sensitive - * to CPU speed that a Thread.sleep(1000) succeeds when my - * laptop is plugged in, but fails when it is on battery! - * - * @throws InterruptedException - */ - public void testMemMapCleared() throws InterruptedException { - TestUtils.forceScarceMemory(); - System.gc(); // minimize effects of earlier test heap use - assertEquals(0, cache.memMap.size()); - assertEquals(0, cache.diskMap.size()); - for(int i=0; i < 10000; i++) { - String key = ""+i; - cache.getOrUse( - key, - new Supplier>>( - new IdentityCacheableWrapper>( - key, new HashMap()))); - } - assertEquals(cache.memMap.size(), 10000); - assertEquals(cache.size(), 10000); - TestUtils.forceScarceMemory(); - Thread.sleep(3000); - // The 'canary' trick may make this explicit page-out, or - // a page-out riggered by a get() or put...(), unnecessary -- - // but we include anyway. - cache.pageOutStaleEntries(); - System.out.println(cache.size()+","+cache.memMap.size()); - assertEquals("memMap not cleared", 0, cache.memMap.size()); - } - - - public static void main(String [] args) { - junit.textui.TestRunner.run(ObjectIdentityBdbCacheTest.class); - } -} From d3d19e224895df802653aff56cf48e4cd59ce4a4 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Fri, 2 Aug 2019 17:57:09 +0900 Subject: [PATCH 019/123] Prevent newer surefire from invoking FetchHTTPTests directly It seems older versions only matched *Test.java but the new version is now incorrectly including it. --- modules/pom.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/pom.xml b/modules/pom.xml index 6ea632ad2..0115535c4 100644 --- a/modules/pom.xml +++ b/modules/pom.xml @@ -80,6 +80,9 @@ **/TestAll.java + + **/*Tests.java From 9d1341c731f1cb747128ad7534ce553b0736bee7 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Mon, 5 Aug 2019 12:57:31 +0900 Subject: [PATCH 020/123] JDK11 support: upgrade jetty to 9.4.19 in modules tests Fixes #268 ssl handshake_failure. Support for jdk11 was added in jetty 9.4.14 but we may as well bump to the latest stable version. In Jetty 9 it appears the request is logged after the response is sent. Thus it was racing with the assertions that check the client IP. So to fix this rather stashing the 'lastRequest' we just make the server echo the client's IP in a response header. Some other minor tweaks were needed due to changes in Jetty behaviour: - We stop checking the length of the raw response. It doesn't tell us anything and easily varies. 
- Jetty now generates Set-Cookie with a space after the ; - Jetty now lowercases the word "basic" in WWW-Authenticate header - testLaxUrlEncoding(): Jetty now rejects bad paths with a 400 error so we disable the response checks. The actual request line is still checked which is the important thing. Note: This patch does not affect the version of jetty used by the Heritrix admin console. That will be tackled separately. --- modules/pom.xml | 20 +--- .../CookieFetchHTTPIntegrationTest.java | 40 ++++--- .../modules/fetcher/FetchHTTPTest.java | 104 ++++++++---------- .../modules/fetcher/FetchHTTPTests.java | 12 +- .../recrawl/ContentDigestHistoryTest.java | 20 ++-- 5 files changed, 86 insertions(+), 110 deletions(-) diff --git a/modules/pom.xml b/modules/pom.xml index d38cd346a..a9d777e80 100644 --- a/modules/pom.xml +++ b/modules/pom.xml @@ -35,23 +35,15 @@ 1.6.3 - org.mortbay.jetty - jetty-util - 6.1.26 - test + org.eclipse.jetty + jetty-server + 9.4.19.v20190610 - org.mortbay.jetty - jetty-sslengine - 6.1.26 - test + org.eclipse.jetty + jetty-security + 9.4.19.v20190610 - - org.mortbay.jetty - jetty - 6.1.26 - test - org.littleshoot littleproxy diff --git a/modules/src/test/java/org/archive/modules/fetcher/CookieFetchHTTPIntegrationTest.java b/modules/src/test/java/org/archive/modules/fetcher/CookieFetchHTTPIntegrationTest.java index 4ea94d22c..257c85024 100644 --- a/modules/src/test/java/org/archive/modules/fetcher/CookieFetchHTTPIntegrationTest.java +++ b/modules/src/test/java/org/archive/modules/fetcher/CookieFetchHTTPIntegrationTest.java @@ -43,18 +43,17 @@ import org.archive.spring.ConfigPath; import org.archive.util.KeyTool; import org.archive.util.TmpDirTestCase; -import org.mortbay.jetty.Request; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.security.SslSocketConnector; -import org.mortbay.jetty.servlet.SessionHandler; -import org.mortbay.log.Log; import com.google.common.io.Files; import junit.extensions.TestSetup; import junit.framework.Test; import junit.framework.TestSuite; +import org.eclipse.jetty.http.HttpVersion; +import org.eclipse.jetty.server.*; +import org.eclipse.jetty.server.session.SessionHandler; +import org.eclipse.jetty.util.log.Log; +import org.eclipse.jetty.util.ssl.SslContextFactory; public class CookieFetchHTTPIntegrationTest extends ProcessorTestBase { @@ -64,9 +63,7 @@ public TestHandler() { } @Override - public void handle(String target, HttpServletRequest request, - HttpServletResponse response, int dispatch) throws IOException, - ServletException { + public void doHandle(String target, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException { if (request.getParameter("name") != null) { Cookie cookie = new javax.servlet.http.Cookie(request.getParameter("name"), request.getParameter("value")); @@ -114,7 +111,7 @@ public static Server startHttpServer() throws Exception { server.setHandler(new TestHandler()); - SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7777); @@ -133,11 +130,18 @@ public static Server startHttpServer() throws Exception { "-dname", "CN=127.0.0.1", "-validity","3650"}); // 10 yr validity - SslSocketConnector ssc = new SslSocketConnector(); + SslContextFactory sslContextFactory = new SslContextFactory(); + sslContextFactory.setKeyStorePassword(KEYSTORE_PASSWORD); + sslContextFactory.setKeyStorePath(keystoreFile.getPath()); + + 
HttpConfiguration httpsConfig = new HttpConfiguration(); + httpsConfig.addCustomizer(new SecureRequestCustomizer()); + + ServerConnector ssc = new ServerConnector(server, + new SslConnectionFactory(sslContextFactory, HttpVersion.HTTP_1_1.asString()), + new HttpConnectionFactory(httpsConfig)); ssc.setHost("127.0.0.1"); ssc.setPort(7443); - ssc.setKeyPassword(KEYSTORE_PASSWORD); - ssc.setKeystore(keystoreFile.getPath()); server.addConnector(sc); server.addConnector(ssc); @@ -345,7 +349,7 @@ protected void testExplicitDomain(AbstractCookieStore cookieStore) throws URIExc CrawlURI curi = makeCrawlURI("http://example.com:7777/?name=foo&value=bar&domain=example.com"); fetcher().process(curi); assertFalse(FetchHTTPTests.httpRequestString(curi).toLowerCase().contains("cookie:")); - assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar;Domain=example.com\r\n")); + assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar; Domain=example.com\r\n")); // check second fetch has expected cookie curi = makeCrawlURI("http://example.com:7777/"); @@ -379,7 +383,7 @@ protected void testExplicitDomainWithLeadingDot(AbstractCookieStore cookieStore) CrawlURI curi = makeCrawlURI("http://example.com:7777/?name=foo&value=bar&domain=.example.com"); fetcher().process(curi); assertFalse(FetchHTTPTests.httpRequestString(curi).toLowerCase().contains("cookie:")); - assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar;Domain=.example.com\r\n")); + assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar; Domain=.example.com\r\n")); // check second fetch has expected cookie curi = makeCrawlURI("http://example.com:7777/"); @@ -413,7 +417,7 @@ protected void testRejectDomain(AbstractCookieStore cookieStore) throws URIExcep CrawlURI curi = makeCrawlURI("http://example.com:7777/?name=foo&value=bar&domain=somethingelse.com"); fetcher().process(curi); assertFalse(FetchHTTPTests.httpRequestString(curi).toLowerCase().contains("cookie:")); - assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar;Domain=somethingelse.com\r\n")); + assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar; Domain=somethingelse.com\r\n")); // check fetch of original domain has no cookie curi = makeCrawlURI("http://example.com:7777/"); @@ -434,7 +438,7 @@ protected void testRejectDomain(AbstractCookieStore cookieStore) throws URIExcep curi = makeCrawlURI("http://FOO.example.com:7777/?name=foo&value=bar&domain=BAR.example.com"); fetcher().process(curi); assertFalse(FetchHTTPTests.httpRequestString(curi).toLowerCase().contains("cookie:")); - assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar;Domain=bar.example.com\r\n")); + assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar; Domain=bar.example.com\r\n")); // check fetch of original domain has no cookie curi = makeCrawlURI("http://foo.example.com:7777/"); @@ -463,7 +467,7 @@ protected void testSubdomainParentDomain(AbstractCookieStore cookieStore) throws CrawlURI curi = makeCrawlURI("http://FOO.example.com:7777/?name=foo&value=bar&domain=example.com"); fetcher().process(curi); assertFalse(FetchHTTPTests.httpRequestString(curi).toLowerCase().contains("cookie:")); - assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar;Domain=example.com\r\n")); + assertTrue(FetchHTTPTests.rawResponseString(curi).contains("Set-Cookie: foo=bar; Domain=example.com\r\n")); curi = 
makeCrawlURI("http://FOO.example.com:7777/"); fetcher().process(curi); diff --git a/modules/src/test/java/org/archive/modules/fetcher/FetchHTTPTest.java b/modules/src/test/java/org/archive/modules/fetcher/FetchHTTPTest.java index 0b159abec..08b7c4a63 100644 --- a/modules/src/test/java/org/archive/modules/fetcher/FetchHTTPTest.java +++ b/modules/src/test/java/org/archive/modules/fetcher/FetchHTTPTest.java @@ -30,27 +30,21 @@ import org.archive.modules.ProcessorTestBase; import org.archive.util.KeyTool; import org.archive.util.TmpDirTestCase; -import org.mortbay.jetty.NCSARequestLog; -import org.mortbay.jetty.Request; -import org.mortbay.jetty.Response; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.HandlerCollection; -import org.mortbay.jetty.handler.RequestLogHandler; -import org.mortbay.jetty.security.Authenticator; -import org.mortbay.jetty.security.BasicAuthenticator; -import org.mortbay.jetty.security.Constraint; -import org.mortbay.jetty.security.ConstraintMapping; -import org.mortbay.jetty.security.DigestAuthenticator; -import org.mortbay.jetty.security.HashUserRealm; -import org.mortbay.jetty.security.SecurityHandler; -import org.mortbay.jetty.security.SslSocketConnector; -import org.mortbay.jetty.servlet.SessionHandler; -import org.mortbay.log.Log; import junit.extensions.TestSetup; import junit.framework.Test; import junit.framework.TestSuite; +import org.eclipse.jetty.http.HttpVersion; +import org.eclipse.jetty.security.*; +import org.eclipse.jetty.security.authentication.BasicAuthenticator; +import org.eclipse.jetty.security.authentication.DigestAuthenticator; +import org.eclipse.jetty.server.*; +import org.eclipse.jetty.server.handler.HandlerCollection; +import org.eclipse.jetty.server.session.SessionHandler; +import org.eclipse.jetty.util.log.Log; +import org.eclipse.jetty.util.security.Constraint; +import org.eclipse.jetty.util.security.Password; +import org.eclipse.jetty.util.ssl.SslContextFactory; public class FetchHTTPTest extends ProcessorTestBase { @@ -125,12 +119,13 @@ protected static class TestHandler extends SessionHandler { public TestHandler() { super(); } - + @Override - public void handle(String target, HttpServletRequest request, - HttpServletResponse response, int dispatch) throws IOException, - ServletException { - + public void doHandle(String target, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException { + + // echo the remote host back to the client so tests can reference it + response.setHeader("Client-Host", request.getRemoteHost()); + if (target.endsWith("/set-cookie")) { response.addCookie(new javax.servlet.http.Cookie("test-cookie-name", "test-cookie-value")); } @@ -221,9 +216,7 @@ public void handle(String target, HttpServletRequest request, } protected static Map httpServers; - protected static Request lastRequest = null; - protected static Response lastResponse = null; - + protected static SecurityHandler makeAuthWrapper(Authenticator authenticator, final String role, String realm, final String login, final String password) { @@ -235,16 +228,15 @@ protected static SecurityHandler makeAuthWrapper(Authenticator authenticator, constraintMapping.setConstraint(constraint); constraintMapping.setPathSpec("/auth/*"); - SecurityHandler authWrapper = new SecurityHandler(); + ConstraintSecurityHandler authWrapper = new ConstraintSecurityHandler(); authWrapper.setAuthenticator(authenticator); authWrapper.setConstraintMappings(new 
ConstraintMapping[] {constraintMapping}); - authWrapper.setUserRealm(new HashUserRealm(realm) { - { - put(login, password); - addUserToRole(login, role); - } - }); + UserStore userStore = new UserStore(); + userStore.addUser(login, new Password(password), new String[] {role}); + HashLoginService loginService = new HashLoginService(realm); + loginService.setUserStore(userStore); + authWrapper.setLoginService(loginService); return authWrapper; } @@ -259,24 +251,10 @@ public static Map startHttpServers() throws Exception { HashMap servers = new HashMap(); - HandlerCollection handlers = new HandlerCollection(); - handlers.addHandler(new TestHandler()); - RequestLogHandler requestLogHandler = new RequestLogHandler(); - NCSARequestLog requestLog = new NCSARequestLog() { - @Override - public void log(Request request, Response response) { - super.log(request, response); - lastRequest = request; - lastResponse = response; - } - }; - requestLogHandler.setRequestLog(requestLog); - handlers.addHandler(requestLogHandler); - // server for basic auth Server server = new Server(); - SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7777); server.addConnector(sc); @@ -284,6 +262,8 @@ public void log(Request request, Response response) { SecurityHandler authWrapper = makeAuthWrapper(new BasicAuthenticator(), BASIC_AUTH_ROLE, BASIC_AUTH_REALM, BASIC_AUTH_LOGIN, BASIC_AUTH_PASSWORD); + HandlerCollection handlers = new HandlerCollection(); + handlers.addHandler(new TestHandler()); authWrapper.setHandler(handlers); server.setHandler(authWrapper); @@ -299,16 +279,23 @@ public void log(Request request, Response response) { "-storepass", KEYSTORE_PASSWORD, "-keypass", KEYSTORE_PASSWORD, "-alias", "jetty", - "-genkey", + "-genkey", "-keyalg", "RSA", "-dname", "CN=127.0.0.1", "-validity","3650"}); // 10 yr validity - - SslSocketConnector ssc = new SslSocketConnector(); + + SslContextFactory sslContextFactory = new SslContextFactory(); + sslContextFactory.setKeyStorePassword(KEYSTORE_PASSWORD); + sslContextFactory.setKeyStorePath(keystoreFile.getPath()); + + HttpConfiguration httpsConfig = new HttpConfiguration(); + httpsConfig.addCustomizer(new SecureRequestCustomizer()); + + ServerConnector ssc = new ServerConnector(server, + new SslConnectionFactory(sslContextFactory, HttpVersion.HTTP_1_1.asString()), + new HttpConnectionFactory(httpsConfig)); ssc.setHost("127.0.0.1"); ssc.setPort(7443); - ssc.setKeyPassword(KEYSTORE_PASSWORD); - ssc.setKeystore(keystoreFile.getPath()); server.addConnector(ssc); @@ -317,7 +304,7 @@ public void log(Request request, Response response) { // server for digest auth server = new Server(); - sc = new SocketConnector(); + sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7778); server.addConnector(sc); @@ -325,7 +312,9 @@ public void log(Request request, Response response) { authWrapper = makeAuthWrapper(new DigestAuthenticator(), DIGEST_AUTH_ROLE, DIGEST_AUTH_REALM, DIGEST_AUTH_LOGIN, DIGEST_AUTH_PASSWORD); - authWrapper.setHandler(handlers); + HandlerCollection handlers2 = new HandlerCollection(); + handlers2.addHandler(new TestHandler()); + authWrapper.setHandler(handlers2); server.setHandler(authWrapper); server.start(); @@ -333,7 +322,7 @@ public void log(Request request, Response response) { return servers; } - + protected static void ensureHttpServers() throws Exception { if (httpServers == null) { httpServers = startHttpServers(); @@ -362,9 +351,4 @@ protected void tearDown() 
throws Exception { } }; } - - public static Request getLastRequest() { - return lastRequest; - } - } diff --git a/modules/src/test/java/org/archive/modules/fetcher/FetchHTTPTests.java b/modules/src/test/java/org/archive/modules/fetcher/FetchHTTPTests.java index 43de8044e..6ca6728cf 100644 --- a/modules/src/test/java/org/archive/modules/fetcher/FetchHTTPTests.java +++ b/modules/src/test/java/org/archive/modules/fetcher/FetchHTTPTests.java @@ -157,7 +157,7 @@ protected void runDefaultChecks(CrawlURI curi, String... exclusionsArray) assertEquals(DEFAULT_PAYLOAD_STRING, curi.getRecorder().getContentReplayCharSequence().toString()); if (!exclusions.contains("httpBindAddress")) { - assertEquals("127.0.0.1", FetchHTTPTest.getLastRequest().getRemoteAddr()); + assertTrue(rawResponseString(curi).contains("Client-Host: 127.0.0.1\r\n")); } if (!exclusions.contains("nonFatalFailuresIsEmpty")) { @@ -262,7 +262,7 @@ public void testBasicAuth() throws Exception { // check that we got the expected response and the fetcher did its thing assertEquals(401, curi.getFetchStatus()); - assertEquals("Basic realm=\"basic-auth-realm\"", curi.getHttpResponseHeader("WWW-Authenticate")); + assertEquals("basic realm=\"basic-auth-realm\"", curi.getHttpResponseHeader("WWW-Authenticate")); assertTrue(curi.getCredentials().contains(basicAuthCredential)); assertTrue(curi.getHttpAuthChallenges() != null && curi.getHttpAuthChallenges().containsKey("basic")); @@ -406,8 +406,8 @@ public void tryHttpBindAddress(String addr) throws Exception { fetcher().process(curi); // the client bind address isn't recorded anywhere in heritrix as - // far as i can tell, so we get it this way... - assertEquals(addr, FetchHTTPTest.getLastRequest().getRemoteAddr()); + // far as i can tell, so we get the server to echo it back to us... 
+ assertTrue(rawResponseString(curi).contains("Client-Host: " + addr + "\r\n")); runDefaultChecks(curi, "httpBindAddress"); } @@ -780,7 +780,9 @@ public void testLaxUrlEncoding() throws Exception { fetcher().process(curi); // logger.info('\n' + httpRequestString(curi) + "\n\n" + rawResponseString(curi)); assertTrue(httpRequestString(curi).startsWith("GET /99% HTTP/1.0\r\n")); - runDefaultChecks(curi, "requestLine"); + // jetty 9 rejects requests with paths like this with 400 Bad Request + // so we can't run these checks anymore + //runDefaultChecks(curi, "requestLine"); } public void testTwoQuestionMarks() throws Exception { diff --git a/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java b/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java index ea2980e06..5e40d6642 100644 --- a/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java +++ b/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java @@ -70,11 +70,11 @@ import org.archive.util.Base32; import org.archive.util.Recorder; import org.archive.util.TmpDirTestCase; -import org.mortbay.jetty.Request; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.HandlerCollection; -import org.mortbay.jetty.servlet.SessionHandler; +import org.eclipse.jetty.server.Request; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.server.handler.HandlerCollection; +import org.eclipse.jetty.server.session.SessionHandler; public class ContentDigestHistoryTest extends TmpDirTestCase { @@ -237,7 +237,6 @@ public void testWarcDedupe() throws Exception { fetcher.process(curi1); assertEquals(200, curi1.getFetchStatus()); - assertEquals(141, curi1.getContentSize()); assertEquals(expectedDigest, curi1.getContentDigestSchemeString()); assertFalse(curi1.hasContentDigestHistory()); @@ -260,7 +259,6 @@ public void testWarcDedupe() throws Exception { fetcher.process(curi2); assertEquals(200, curi1.getFetchStatus()); - assertEquals(141, curi1.getContentSize()); assertEquals(expectedDigest, curi1.getContentDigestSchemeString()); assertFalse(curi2.hasContentDigestHistory()); @@ -305,7 +303,6 @@ public void testWarcDedupe() throws Exception { assertTrue(recordIterator.hasNext()); record = recordIterator.next(); assertEquals(WARCRecordType.response.toString(), record.getHeader().getHeaderValue(HEADER_KEY_TYPE)); - assertEquals("141", record.getHeader().getHeaderValue(CONTENT_LENGTH)); assertEquals(expectedDigest, record.getHeader().getHeaderValue(HEADER_KEY_PAYLOAD_DIGEST)); assertEquals(curi1.getUURI().toString(), record.getHeader().getHeaderValue(HEADER_KEY_URI)); assertEquals(payloadRecordIdWithBrackets, record.getHeader().getHeaderValue(HEADER_KEY_ID)); @@ -374,10 +371,7 @@ protected Server newHttpServer() throws Exception { HandlerCollection handlers = new HandlerCollection(); handlers.addHandler(new SessionHandler(){ @Override - public void handle(String target, HttpServletRequest request, - HttpServletResponse response, int dispatch) throws IOException, - ServletException { - + public void doHandle(String target, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException { response.setContentType("text/plain;charset=US-ASCII"); response.setStatus(HttpServletResponse.SC_OK); response.getOutputStream().write(DEFAULT_PAYLOAD_STRING.getBytes("US-ASCII")); @@ -388,7 +382,7 @@ public void 
handle(String target, HttpServletRequest request, Server server = new Server(); server.setHandler(handlers); - SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7777); server.addConnector(sc); From 0782fb0481d38054f6530b9d3ef8a8745888445f Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 6 Aug 2019 15:10:48 +0900 Subject: [PATCH 021/123] JDK11: upgrade engine to jetty 9.4.19 and restlet 2.4.0 Jetty 9.4.12+ is required for TLS to work correctly under JDK11 (due to SSL handshake failures). In order to upgrade Jetty we also need to upgrade Restlet. There is one intentional change in behaviour to simplify upgrading. We remove a workaround for an old [webkit bug] where the browser claimed to prefer application/xml. The bug was fixed in 2011. [webkit bug]: https://bugs.webkit.org/show_bug.cgi?id=27267 Summary of Jetty API changes: - package names changed such as org.mortbay -> org.eclipse - SocketConnector and SslSocketConnector merged to ServerConnector - HashUserRealm split into UserStore and HashLoginService - SecurityHandler -> ConstraintSecurityHandler Summary of Restlet API changes: - some classes have moved package (Request, Response, Router etc) - ServerResource replaces Resource - represent(), acceptRepresentation() renamed to get(), post() - constructors were replaced by an init() method - setModifiable() was removed - getRequest().getEntityAsForm() -> new Form(entity) - Guard -> ChallengeAuthenticator Fixes #275 --- commons/src/test/resources/log4j.xml | 2 +- contrib/src/main/resources/log4j.xml | 2 +- contrib/src/test/resources/log4j.xml | 2 +- dist/src/main/conf/logging.properties | 2 +- dist/src/test/resources/log4j.xml | 2 +- engine/pom.xml | 60 ++++++++++--------- .../java/org/archive/crawler/Heritrix.java | 33 ++++++---- .../archive/crawler/restlet/BaseResource.java | 45 +------------- .../crawler/restlet/BeanBrowseResource.java | 37 ++++++------ .../crawler/restlet/EditRepresentation.java | 8 +-- .../crawler/restlet/EngineApplication.java | 19 +++--- .../crawler/restlet/EngineResource.java | 37 +++++++----- .../archive/crawler/restlet/EnhDirectory.java | 38 +++++++----- .../crawler/restlet/EnhDirectoryResource.java | 47 ++++++--------- .../org/archive/crawler/restlet/Flash.java | 4 +- .../crawler/restlet/JobRelatedResource.java | 23 +++---- .../archive/crawler/restlet/JobResource.java | 29 +++++---- .../crawler/restlet/PagedRepresentation.java | 8 +-- .../crawler/restlet/RateLimitGuard.java | 24 ++++---- .../crawler/restlet/ReportGenResource.java | 24 ++++---- .../crawler/restlet/ScriptResource.java | 31 +++++----- .../crawler/restlet/XmlMarshaller.java | 2 +- .../crawler/selftest/CheckpointSelfTest.java | 10 ++-- .../crawler/selftest/FormAuthSelfTest.java | 23 +++---- .../crawler/selftest/FormLoginSelfTest.java | 24 ++++---- .../crawler/selftest/HttpAuthSelfTest.java | 41 +++++++------ .../crawler/selftest/SelfTestBase.java | 14 ++--- .../crawler/selftest/UserAgentSelfTest.java | 22 +++---- .../archive/modules/fetcher/FormAuthTest.java | 60 +++++++++---------- engine/src/test/resources/log4j.xml | 2 +- modules/src/test/resources/log4j.xml | 2 +- 31 files changed, 327 insertions(+), 350 deletions(-) diff --git a/commons/src/test/resources/log4j.xml b/commons/src/test/resources/log4j.xml index 475b130e2..e04f34ab8 100644 --- a/commons/src/test/resources/log4j.xml +++ b/commons/src/test/resources/log4j.xml @@ -11,7 +11,7 @@ - + diff --git a/contrib/src/main/resources/log4j.xml 
b/contrib/src/main/resources/log4j.xml index 475b130e2..e04f34ab8 100644 --- a/contrib/src/main/resources/log4j.xml +++ b/contrib/src/main/resources/log4j.xml @@ -11,7 +11,7 @@ - + diff --git a/contrib/src/test/resources/log4j.xml b/contrib/src/test/resources/log4j.xml index 475b130e2..e04f34ab8 100644 --- a/contrib/src/test/resources/log4j.xml +++ b/contrib/src/test/resources/log4j.xml @@ -11,7 +11,7 @@ - + diff --git a/dist/src/main/conf/logging.properties b/dist/src/main/conf/logging.properties index f83723961..094cb1762 100644 --- a/dist/src/main/conf/logging.properties +++ b/dist/src/main/conf/logging.properties @@ -5,7 +5,7 @@ # ...and even less from the too-chatty-with-WARNINGs HttpClient library... org.apache.commons.httpclient.level = SEVERE org.restlet.Component.LogFilter.level = SEVERE -org.mortbay.log.level = SEVERE +org.eclipse.jetty.log.level = SEVERE # ...but INFO for our classes, which reserve FINE/FINER/FINEST for bulk/trivia... org.archive.level = INFO diff --git a/dist/src/test/resources/log4j.xml b/dist/src/test/resources/log4j.xml index 475b130e2..e04f34ab8 100644 --- a/dist/src/test/resources/log4j.xml +++ b/dist/src/test/resources/log4j.xml @@ -11,7 +11,7 @@ - + diff --git a/engine/pom.xml b/engine/pom.xml index 14410e9ea..909bdeb48 100644 --- a/engine/pom.xml +++ b/engine/pom.xml @@ -25,44 +25,50 @@ ${project.version} compile + - org.mortbay.jetty - jetty - 6.1.26 - compile + org.eclipse.jetty + jetty-server + 9.4.19.v20190610 - org.mortbay.jetty + org.eclipse.jetty jetty-util - 6.1.26 - compile + 9.4.19.v20190610 - org.mortbay.jetty - jetty-sslengine - 6.1.26 - compile - - - org.mortbay.jetty - jetty-ajp - 6.1.26 - compile - + org.eclipse.jetty + jetty-servlet + 9.4.19.v20190610 + test + + + org.restlet.jse + org.restlet + 2.4.0 + - org.restlet - org.restlet - 1.1.10 + org.restlet.jse + org.restlet.ext.jetty + 2.4.0 + + + org.eclipse.jetty + jetty-client + + - com.noelios.restlet - com.noelios.restlet - 1.1.10 + org.restlet.jse + org.restlet.ext.xml + 2.4.0 - com.noelios.restlet - com.noelios.restlet.ext.jetty - 1.1.10 + + org.restlet.jse + org.restlet.ext.crypto + 2.4.0 joda-time diff --git a/engine/src/main/java/org/archive/crawler/Heritrix.java b/engine/src/main/java/org/archive/crawler/Heritrix.java index 114950722..a737f1d78 100644 --- a/engine/src/main/java/org/archive/crawler/Heritrix.java +++ b/engine/src/main/java/org/archive/crawler/Heritrix.java @@ -57,10 +57,11 @@ import org.archive.util.ArchiveUtils; import org.archive.util.KeyTool; import org.restlet.Component; -import org.restlet.Guard; import org.restlet.Server; import org.restlet.data.ChallengeScheme; import org.restlet.data.Protocol; +import org.restlet.security.ChallengeAuthenticator; +import org.restlet.security.MapVerifier; /** @@ -212,7 +213,7 @@ public void instanceMain(String[] args) "mailto, clsid, res, file, rtsp, about"); } - String maxFormSize = "org.mortbay.jetty.Request.maxFormContentSize"; + String maxFormSize = "org.eclipse.jetty.server.Request.maxFormContentSize"; if (System.getProperty(maxFormSize) == null) { System.setProperty(maxFormSize, "52428800"); } @@ -334,24 +335,30 @@ public void instanceMain(String[] args) try { engine = new Engine(jobsDir); component = new Component(); - + if(bindHosts.isEmpty()) { // listen all addresses - setupServer(port, null, keystorePath, keystorePassword, keyPassword); + setupServer(component, port, null, keystorePath, keystorePassword, keyPassword); } else { // bind only to declared addresses, or just 'localhost' for(String address : bindHosts) { 
- setupServer(port, address, keystorePath, keystorePassword, keyPassword); + setupServer(component, port, address, keystorePath, keystorePassword, keyPassword); } } component.getClients().add(Protocol.FILE); - component.getClients().add(Protocol.CLAP); - Guard guard = new RateLimitGuard(null, + component.getClients().add(Protocol.CLAP); + + MapVerifier verifier = new MapVerifier(); + verifier.getLocalSecrets().put(authLogin, authPassword.toCharArray()); + + ChallengeAuthenticator guard = new RateLimitGuard(component.getContext(), ChallengeScheme.HTTP_DIGEST, "Authentication Required"); - guard.getSecrets().put(authLogin, authPassword.toCharArray()); - component.getDefaultHost().attach(guard); + guard.setVerifier(verifier); guard.setNext(new EngineApplication(engine)); + + component.getDefaultHost().attach(guard); component.start(); + startupOut.println("engine listening at port "+port); startupOut.println("operator login set per " + ((aOption.startsWith("@")) ? "file "+aOption : "command-line")); @@ -457,16 +464,16 @@ protected void useAdhocKeystore(PrintStream startupOut) { /** * Create an HTTPS restlet Server instance matching the given parameters. - * + * + * @param component * @param port * @param address * @param keystorePath * @param keystorePassword * @param keyPassword */ - protected void setupServer(int port, String address, String keystorePath, String keystorePassword, String keyPassword) { - Server server = new Server(Protocol.HTTPS,address,port,null); - component.getServers().add(server); + protected void setupServer(Component component, int port, String address, String keystorePath, String keystorePassword, String keyPassword) { + Server server = component.getServers().add(Protocol.HTTPS, address, port); server.getContext().getParameters().add("keystorePath", keystorePath); server.getContext().getParameters().add("keystorePassword", keystorePassword); server.getContext().getParameters().add("keyPassword", keyPassword); diff --git a/engine/src/main/java/org/archive/crawler/restlet/BaseResource.java b/engine/src/main/java/org/archive/crawler/restlet/BaseResource.java index 1ebc80efb..72ae434b2 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/BaseResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/BaseResource.java @@ -19,55 +19,14 @@ package org.archive.crawler.restlet; -import java.util.List; - -import org.restlet.Context; -import org.restlet.data.MediaType; -import org.restlet.data.Preference; -import org.restlet.data.Request; -import org.restlet.data.Response; -import org.restlet.resource.Resource; -import org.restlet.resource.Variant; +import org.restlet.resource.ServerResource; /** * Abstract {@code Resource} with common shared functionality. * * @author nlevitt */ -public abstract class BaseResource extends Resource { - - public BaseResource(Context ctx, Request req, Response res) { - super(ctx, req, res); - } - - /** - * If client can accept text/html, always prefer it. WebKit-based browsers - * claim to want application/xml, but we don't want to give it to them. 
See - * https://webarchive.jira.com/browse/HER-1603 - */ - public Variant getPreferredVariant() { - boolean addExplicitTextHtmlPreference = false; - - for (Preference mediaTypePreference: getRequest().getClientInfo().getAcceptedMediaTypes()) { - if (mediaTypePreference.getMetadata().equals(MediaType.TEXT_HTML)) { - mediaTypePreference.setQuality(Float.MAX_VALUE); - addExplicitTextHtmlPreference = false; - break; - } else if (mediaTypePreference.getMetadata().includes(MediaType.TEXT_HTML)) { - addExplicitTextHtmlPreference = true; - } - } - - if (addExplicitTextHtmlPreference) { - List> acceptedMediaTypes = getRequest().getClientInfo().getAcceptedMediaTypes(); - acceptedMediaTypes.add(new Preference(MediaType.TEXT_HTML, Float.MAX_VALUE)); - getRequest().getClientInfo().setAcceptedMediaTypes(acceptedMediaTypes); - } - - - return super.getPreferredVariant(); - } - +public abstract class BaseResource extends ServerResource { protected String getStaticRef(String resource) { String rootRef = getRequest().getRootRef().toString(); return rootRef + "/engine/static/" + resource; diff --git a/engine/src/main/java/org/archive/crawler/restlet/BeanBrowseResource.java b/engine/src/main/java/org/archive/crawler/restlet/BeanBrowseResource.java index b09d70af7..5ae0c8975 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/BeanBrowseResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/BeanBrowseResource.java @@ -19,16 +19,12 @@ package org.archive.crawler.restlet; -import java.io.File; import java.io.IOException; -import java.io.PrintWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.URLDecoder; import java.util.Collection; -import java.util.HashMap; import java.util.HashSet; -import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.Set; @@ -36,18 +32,18 @@ import org.archive.crawler.restlet.models.BeansModel; import org.archive.crawler.restlet.models.ViewModel; import org.archive.spring.PathSharingContext; -import org.archive.util.TextUtils; import org.restlet.Context; import org.restlet.data.CharacterSet; import org.restlet.data.Form; import org.restlet.data.MediaType; import org.restlet.data.Reference; -import org.restlet.data.Request; -import org.restlet.data.Response; -import org.restlet.resource.Representation; +import org.restlet.Request; +import org.restlet.Response; +import org.restlet.representation.EmptyRepresentation; +import org.restlet.representation.Representation; +import org.restlet.representation.WriterRepresentation; import org.restlet.resource.ResourceException; -import org.restlet.resource.Variant; -import org.restlet.resource.WriterRepresentation; +import org.restlet.representation.Variant; import org.springframework.beans.BeanWrapperImpl; import org.springframework.beans.BeansException; @@ -69,12 +65,12 @@ public class BeanBrowseResource extends JobRelatedResource { protected PathSharingContext appCtx; protected String beanPath; private Configuration _templateConfiguration; - - public BeanBrowseResource(Context ctx, Request req, Response res) throws ResourceException { - super(ctx, req, res); + + @Override + public void init(Context ctx, Request req, Response res) throws ResourceException { + super.init(ctx, req, res); getVariants().add(new Variant(MediaType.TEXT_HTML)); getVariants().add(new Variant(MediaType.APPLICATION_XML)); - setModifiable(true); // accept POSTs appCtx = cj.getJobContext(); beanPath = (String)req.getAttributes().get("beanPath"); if (beanPath!=null) { @@ -99,13 +95,14 @@ public 
Configuration getTemplateConfiguration(){ return _templateConfiguration; } - public void acceptRepresentation(Representation entity) throws ResourceException { + @Override + protected Representation post(Representation entity, Variant variant) throws ResourceException { if (appCtx == null) { throw new ResourceException(404); } // copy op? - Form form = getRequest().getEntityAsForm(); + Form form = new Form(entity); beanPath = form.getFirstValue("beanPath"); String newVal = form.getFirstValue("newVal"); @@ -122,6 +119,7 @@ public void acceptRepresentation(Representation entity) throws ResourceException ref.setPath(getBeansRefPath()); ref.addSegment(beanPath); getResponse().redirectSeeOther(ref); + return new EmptyRepresentation(); } public String getBeansRefPath() { @@ -137,7 +135,8 @@ public String getBeansRefPath() { return path; } - public Representation represent(Variant variant) throws ResourceException { + @Override + public Representation get(Variant variant) throws ResourceException { if (appCtx == null) { throw new ResourceException(404); } @@ -209,8 +208,8 @@ protected BeansModel makeDataModel(){ for(String name: appCtx.getBeanDefinitionNames()) { addPresentableNestedNames(nestedNames, appCtx.getBean(name), alreadyWritten); } - - return new BeansModel(cj.getShortName(), + + return new BeansModel(cj.getShortName(), new Reference(getRequest().getResourceRef().getBaseRef(), "..").getTargetRef().toString(), beanPath, bean, diff --git a/engine/src/main/java/org/archive/crawler/restlet/EditRepresentation.java b/engine/src/main/java/org/archive/crawler/restlet/EditRepresentation.java index a4ea7996b..232dbdd2d 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/EditRepresentation.java +++ b/engine/src/main/java/org/archive/crawler/restlet/EditRepresentation.java @@ -30,8 +30,8 @@ import org.restlet.data.CharacterSet; import org.restlet.data.MediaType; import org.restlet.data.Reference; -import org.restlet.resource.CharacterRepresentation; -import org.restlet.resource.FileRepresentation; +import org.restlet.representation.CharacterRepresentation; +import org.restlet.representation.FileRepresentation; /** * Representation wrapping a FileRepresentation, displaying its contents @@ -40,9 +40,9 @@ * @author gojomo */ public class EditRepresentation extends CharacterRepresentation { - protected FileRepresentation fileRepresentation; + protected FileRepresentation fileRepresentation; protected EnhDirectoryResource dirResource; - + public EditRepresentation(FileRepresentation representation, EnhDirectoryResource resource) { super(MediaType.TEXT_HTML); fileRepresentation = representation; diff --git a/engine/src/main/java/org/archive/crawler/restlet/EngineApplication.java b/engine/src/main/java/org/archive/crawler/restlet/EngineApplication.java index 6c7793f71..a82c30261 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/EngineApplication.java +++ b/engine/src/main/java/org/archive/crawler/restlet/EngineApplication.java @@ -26,19 +26,19 @@ import org.archive.crawler.framework.Engine; import org.archive.util.TextUtils; import org.restlet.Application; -import org.restlet.Directory; -import org.restlet.Redirector; import org.restlet.Restlet; -import org.restlet.Router; import org.restlet.data.MediaType; import org.restlet.data.Reference; -import org.restlet.data.Request; -import org.restlet.data.Response; +import org.restlet.Request; +import org.restlet.Response; import org.restlet.data.Status; -import org.restlet.resource.Representation; -import 
org.restlet.resource.StringRepresentation; +import org.restlet.representation.Representation; +import org.restlet.representation.StringRepresentation; +import org.restlet.resource.Directory; +import org.restlet.routing.Redirector; +import org.restlet.routing.Router; +import org.restlet.routing.Template; import org.restlet.service.StatusService; -import org.restlet.util.Template; /** * Restlet Application for a Heritrix crawl 'Engine', which is aware of @@ -56,7 +56,8 @@ public EngineApplication(Engine engine) { setStatusService(new EngineStatusService()); } - public synchronized Restlet createRoot() { + @Override + public Restlet createInboundRoot() { Router router = new Router(getContext()); router.attach("/",new Redirector(null,"/engine",Redirector.MODE_CLIENT_TEMPORARY)); diff --git a/engine/src/main/java/org/archive/crawler/restlet/EngineResource.java b/engine/src/main/java/org/archive/crawler/restlet/EngineResource.java index b18d9f3dc..dabd7f3ea 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/EngineResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/EngineResource.java @@ -30,21 +30,24 @@ import org.archive.crawler.restlet.models.EngineModel; import org.archive.crawler.restlet.models.ViewModel; import org.restlet.Context; +import org.restlet.Request; +import org.restlet.Response; import org.restlet.data.CharacterSet; import org.restlet.data.Form; import org.restlet.data.MediaType; -import org.restlet.data.Request; -import org.restlet.data.Response; -import org.restlet.resource.Representation; +import org.restlet.representation.EmptyRepresentation; +import org.restlet.representation.Representation; +import org.restlet.representation.WriterRepresentation; import org.restlet.resource.ResourceException; -import org.restlet.resource.Variant; -import org.restlet.resource.WriterRepresentation; +import org.restlet.representation.Variant; import freemarker.template.Configuration; import freemarker.template.DefaultObjectWrapper; import freemarker.template.Template; import freemarker.template.TemplateException; +import static org.restlet.data.MediaType.APPLICATION_XML; + /** * Restlet Resource representing an Engine that may be used * to assemble, launch, monitor, and manage crawls. 
@@ -56,11 +59,12 @@ public class EngineResource extends BaseResource { private Configuration _templateConfiguration; - public EngineResource(Context ctx, Request req, Response res) { - super(ctx, req, res); - setModifiable(true); + + @Override + public void init(Context ctx, Request req, Response res) { + super.init(ctx, req, res); getVariants().add(new Variant(MediaType.TEXT_HTML)); - getVariants().add(new Variant(MediaType.APPLICATION_XML)); + getVariants().add(new Variant(APPLICATION_XML)); Configuration tmpltCfg = new Configuration(); tmpltCfg.setClassForTemplateLoading(this.getClass(),""); @@ -74,10 +78,12 @@ public void setTemplateConfiguration(Configuration tmpltCfg) { public Configuration getTemplateConfiguration(){ return _templateConfiguration; } - public Representation represent(Variant variant) throws ResourceException { + + @Override + protected Representation get(Variant variant) throws ResourceException { Representation representation; - if (variant.getMediaType() == MediaType.APPLICATION_XML) { - representation = new WriterRepresentation(MediaType.APPLICATION_XML) { + if (variant.getMediaType() == APPLICATION_XML) { + representation = new WriterRepresentation(APPLICATION_XML) { public void write(Writer writer) throws IOException { XmlMarshaller.marshalDocument(writer, "engine", makeDataModel()); } @@ -93,10 +99,10 @@ public void write(Writer writer) throws IOException { representation.setCharacterSet(CharacterSet.UTF_8); return representation; } - + @Override - public void acceptRepresentation(Representation entity) throws ResourceException { - Form form = getRequest().getEntityAsForm(); + protected Representation post(Representation entity, Variant variant) throws ResourceException { + Form form = new Form(entity); String action = form.getFirstValue("action"); if("rescan".equals(action)) { getEngine().findJobConfigs(); @@ -182,6 +188,7 @@ public void acceptRepresentation(Representation entity) throws ResourceException } // default: redirect to GET self getResponse().redirectSeeOther(getRequest().getOriginalRef()); + return new EmptyRepresentation(); } diff --git a/engine/src/main/java/org/archive/crawler/restlet/EnhDirectory.java b/engine/src/main/java/org/archive/crawler/restlet/EnhDirectory.java index ea7bb79d6..bb26b16ba 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/EnhDirectory.java +++ b/engine/src/main/java/org/archive/crawler/restlet/EnhDirectory.java @@ -26,10 +26,12 @@ import org.apache.commons.io.filefilter.FileFilterUtils; import org.apache.commons.io.filefilter.IOFileFilter; import org.restlet.Context; -import org.restlet.Handler; import org.restlet.data.Reference; -import org.restlet.data.Request; -import org.restlet.data.Response; +import org.restlet.Request; +import org.restlet.Response; +import org.restlet.data.Status; +import org.restlet.resource.Directory; +import org.restlet.resource.ServerResource; /** * Enhanced version of Restlet Directory, which allows the local @@ -39,11 +41,11 @@ * * @author gojomo */ -public abstract class EnhDirectory extends org.restlet.Directory { +public abstract class EnhDirectory extends Directory { protected IOFileFilter editFilter = FileFilterUtils.falseFileFilter(); protected IOFileFilter pageFilter = FileFilterUtils.falseFileFilter(); protected IOFileFilter tailFilter = FileFilterUtils.falseFileFilter(); - + public EnhDirectory(Context context, Reference rootLocalReference) { super(context, rootLocalReference); // TODO Auto-generated constructor stub @@ -55,21 +57,27 @@ public EnhDirectory(Context 
context, String rootUri) { } @Override - public Handler findTarget(Request request, Response response) { - Handler retVal; - synchronized(this) { + public void handle(Request request, Response response) { + synchronized (this) { Reference oldRef = getRootRef(); setRootRef(determineRootRef(request)); try { - retVal = new EnhDirectoryResource(this, request, response); - } catch (IOException ioe) { - getLogger().log(Level.WARNING, - "Unable to find the directory's resource", ioe); - retVal = null; + super.handle(request, response); + } finally { + setRootRef(oldRef); + } + + // XXX: FileRepresentation.isAvailable() returns false for empty files generating status 204 No Content + // which confuses browsers. Force it back it 200 OK. + if (response.getStatus() == Status.SUCCESS_NO_CONTENT) { + response.setStatus(Status.SUCCESS_OK); } - setRootRef(oldRef); } - return retVal; + } + + @Override + public ServerResource create(Request request, Response response) { + return new EnhDirectoryResource(); } protected abstract Reference determineRootRef(Request request); diff --git a/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java b/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java index 5f7c0b5e0..18c593543 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java @@ -23,22 +23,18 @@ import java.io.File; import java.io.IOException; import java.net.URI; +import java.util.LinkedList; import java.util.List; import java.util.ListIterator; import org.apache.commons.io.FileUtils; -import org.restlet.data.CharacterSet; -import org.restlet.data.Form; -import org.restlet.data.Reference; -import org.restlet.data.Request; -import org.restlet.data.Response; -import org.restlet.data.Status; -import org.restlet.resource.FileRepresentation; -import org.restlet.resource.Representation; +import org.restlet.data.*; +import org.restlet.engine.local.DirectoryServerResource; +import org.restlet.representation.EmptyRepresentation; +import org.restlet.representation.FileRepresentation; +import org.restlet.representation.Representation; +import org.restlet.representation.Variant; import org.restlet.resource.ResourceException; -import org.restlet.resource.Variant; - -import com.noelios.restlet.local.DirectoryResource; /** * Enhanced version of Restlet DirectoryResource, adding ability to @@ -46,20 +42,15 @@ * * @author gojomo */ -public class EnhDirectoryResource extends DirectoryResource { - - public EnhDirectoryResource(EnhDirectory directory, Request request, Response response) throws IOException { - super(directory, request, response); - } - - /** +public class EnhDirectoryResource extends DirectoryServerResource { + /** * Add EditRepresentation as a variant when appropriate. 
* - * @see com.noelios.restlet.local.DirectoryResource#getVariants() + * @see org.restlet.engine.local.DirectoryServerResource#getVariants() */ @Override public List getVariants() { - List variants = super.getVariants(); + List variants = new LinkedList<>(super.getVariants(Method.GET)); Form f = getRequest().getResourceRef().getQueryAsForm(); String format = f.getFirstValue("format"); if("textedit".equals(format)) { @@ -73,7 +64,7 @@ public List getVariants() { } catch (Exception e) { throw new RuntimeException(e); } - variants = super.getVariants(); + variants = new LinkedList<>(super.getVariants(Method.GET)); } // wrap FileRepresentations in EditRepresentations ListIterator iter = variants.listIterator(); @@ -119,18 +110,18 @@ public List getVariants() { } protected EnhDirectory getEnhDirectory() { - return (EnhDirectory)getDirectory(); + return (EnhDirectory) getDirectory(); } - /** + /** * Accept a POST used to edit or create a file. * - * @see org.restlet.resource.Resource#acceptRepresentation(org.restlet.resource.Representation) + * @see org.restlet.resource.ServerResource#post(Representation, Variant) */ - public void acceptRepresentation(Representation entity) - throws ResourceException { + @Override + protected Representation post(Representation entity, Variant variant) throws ResourceException { // TODO: only allowPost on valid targets - Form form = getRequest().getEntityAsForm(); + Form form = new Form(entity); String newContents = form.getFirstValue("contents"); EditRepresentation er; try { @@ -152,6 +143,6 @@ public void acceptRepresentation(Representation entity) Reference ref = getRequest().getOriginalRef().clone(); /// ref.setQuery(null); getResponse().redirectSeeOther(ref); - + return new EmptyRepresentation(); } } diff --git a/engine/src/main/java/org/archive/crawler/restlet/Flash.java b/engine/src/main/java/org/archive/crawler/restlet/Flash.java index a43d6c9ec..441408663 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/Flash.java +++ b/engine/src/main/java/org/archive/crawler/restlet/Flash.java @@ -29,10 +29,10 @@ import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.math.RandomUtils; +import org.restlet.Request; +import org.restlet.Response; import org.restlet.data.Cookie; import org.restlet.data.CookieSetting; -import org.restlet.data.Request; -import org.restlet.data.Response; import org.restlet.util.Series; /** diff --git a/engine/src/main/java/org/archive/crawler/restlet/JobRelatedResource.java b/engine/src/main/java/org/archive/crawler/restlet/JobRelatedResource.java index ae967a0a9..4dfe548c7 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/JobRelatedResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/JobRelatedResource.java @@ -40,8 +40,8 @@ import org.archive.util.TextUtils; import org.restlet.Context; import org.restlet.data.Reference; -import org.restlet.data.Request; -import org.restlet.data.Response; +import org.restlet.Request; +import org.restlet.Response; import org.restlet.resource.ResourceException; import org.springframework.beans.BeanUtils; import org.springframework.beans.BeanWrapperImpl; @@ -58,18 +58,19 @@ public abstract class JobRelatedResource extends BaseResource { private final static Logger LOGGER = Logger.getLogger(JobRelatedResource.class.getName()); - protected CrawlJob cj; + protected CrawlJob cj; protected IdentityHashMap beanToNameMap; - - public JobRelatedResource(Context ctx, Request req, Response res) throws ResourceException { - super(ctx, req, res); + + @Override + 
public void init(Context ctx, Request req, Response res) { + super.init(ctx, req, res); cj = getEngine().getJob((String)req.getAttributes().get("job")); - if(cj==null) { + if(cj == null) { throw new ResourceException(404); } } - + protected Engine getEngine() { return ((EngineApplication)getApplication()).getEngine(); } @@ -95,13 +96,13 @@ protected void addPresentableNestedNames(Collection namedBeans, Object o return; } - - Reference baseRef = getRequest().getResourceRef().getBaseRef(); + + Reference baseRef = getRequest().getResourceRef().getBaseRef(); if (baseRef.getPath().endsWith("beans")) { baseRef.setPath(baseRef.getPath() + "/"); } - + if (getBeanToNameMap().containsKey(obj)) { // this object is itself a named bean Map bean = new LinkedHashMap(); diff --git a/engine/src/main/java/org/archive/crawler/restlet/JobResource.java b/engine/src/main/java/org/archive/crawler/restlet/JobResource.java index d3b0a295c..5b4ab6c74 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/JobResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/JobResource.java @@ -38,12 +38,13 @@ import org.restlet.data.CharacterSet; import org.restlet.data.Form; import org.restlet.data.MediaType; -import org.restlet.data.Request; -import org.restlet.data.Response; -import org.restlet.resource.Representation; +import org.restlet.Request; +import org.restlet.Response; +import org.restlet.representation.EmptyRepresentation; +import org.restlet.representation.Representation; +import org.restlet.representation.WriterRepresentation; import org.restlet.resource.ResourceException; -import org.restlet.resource.Variant; -import org.restlet.resource.WriterRepresentation; +import org.restlet.representation.Variant; import freemarker.template.Configuration; import freemarker.template.ObjectWrapper; @@ -68,10 +69,10 @@ public class JobResource extends BaseResource { protected CrawlJob cj; - public JobResource(Context ctx, Request req, Response res) + @Override + public void init(Context ctx, Request req, Response res) throws ResourceException { - super(ctx, req, res); - setModifiable(true); + super.init(ctx, req, res); getVariants().add(new Variant(MediaType.TEXT_HTML)); getVariants().add(new Variant(MediaType.APPLICATION_XML)); cj = getEngine().getJob( @@ -88,7 +89,9 @@ public void setTemplateConfiguration(Configuration tmpltCfg) { public Configuration getTemplateConfiguration(){ return _templateConfiguration; } - public Representation represent(Variant variant) throws ResourceException { + + @Override + public Representation get(Variant variant) throws ResourceException { if (cj == null) { throw new ResourceException(404); } @@ -184,19 +187,18 @@ protected Engine getEngine() { } @Override - public void acceptRepresentation(Representation entity) + public Representation post(Representation entity, Variant variant) throws ResourceException { if (cj == null) { throw new ResourceException(404); } // copy op? 
- Form form = null; - form = getRequest().getEntityAsForm(); + Form form = new Form(entity); String copyTo = form.getFirstValue("copyTo"); if (copyTo != null) { copyJob(copyTo, "on".equals(form.getFirstValue("asProfile"))); - return; + return new EmptyRepresentation(); } AlertHandler.ensureStaticInitialization(); AlertThreadGroup.setThreadLogger(cj.getJobLogger()); @@ -241,6 +243,7 @@ public void acceptRepresentation(Representation entity) // default: redirect to GET self getResponse().redirectSeeOther(getRequest().getOriginalRef()); + return new EmptyRepresentation(); } protected void copyJob(String copyTo, boolean asProfile) diff --git a/engine/src/main/java/org/archive/crawler/restlet/PagedRepresentation.java b/engine/src/main/java/org/archive/crawler/restlet/PagedRepresentation.java index 363ee674e..aabd0d468 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/PagedRepresentation.java +++ b/engine/src/main/java/org/archive/crawler/restlet/PagedRepresentation.java @@ -38,8 +38,8 @@ import org.restlet.data.Form; import org.restlet.data.MediaType; import org.restlet.data.Reference; -import org.restlet.resource.CharacterRepresentation; -import org.restlet.resource.FileRepresentation; +import org.restlet.representation.CharacterRepresentation; +import org.restlet.representation.FileRepresentation; /** * Representation wrapping a FileRepresentation, displaying its contents @@ -50,7 +50,7 @@ public class PagedRepresentation extends CharacterRepresentation { // passed-in at construction /** wrapped FileRepresentation **/ - protected FileRepresentation fileRepresentation; + protected FileRepresentation fileRepresentation; /** wrapped EnhDirectoryResource; used to formulate self-links **/ protected EnhDirectoryResource dirResource; @@ -124,7 +124,7 @@ protected void loadLines() throws IOException { /** * Write the paged HTML. * - * @see org.restlet.resource.Representation#write(java.io.Writer) + * @see org.restlet.representation.Representation#write(java.io.Writer) */ @Override public void write(Writer writer) throws IOException { diff --git a/engine/src/main/java/org/archive/crawler/restlet/RateLimitGuard.java b/engine/src/main/java/org/archive/crawler/restlet/RateLimitGuard.java index d4d304ca7..042fd2ea6 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/RateLimitGuard.java +++ b/engine/src/main/java/org/archive/crawler/restlet/RateLimitGuard.java @@ -18,21 +18,21 @@ */ package org.archive.crawler.restlet; -import java.util.Collection; import java.util.logging.Logger; import org.restlet.Context; -import org.restlet.Guard; +import org.restlet.Request; +import org.restlet.Response; import org.restlet.data.ChallengeScheme; -import org.restlet.data.Request; +import org.restlet.security.ChallengeAuthenticator; /** - * Guard that slows and logs failed authentication attempts, to make + * ChallengeAuthenticator that slows and logs failed authentication attempts, to make * brute-force guessing attacks less feasible. 
* * @author gojomo */ -public class RateLimitGuard extends Guard { +public class RateLimitGuard extends ChallengeAuthenticator { private static final int MIN_MS_BETWEEN_ATTEMPTS = 6000; private static final Logger logger = Logger.getLogger(RateLimitGuard.class.getName()); @@ -43,14 +43,10 @@ public RateLimitGuard(Context context, ChallengeScheme scheme, String realm) thr super(context, scheme, realm); } - public RateLimitGuard(Context context, String realm, Collection baseUris, String serverKey) { - super(context, realm, baseUris, serverKey); - } - @Override - public synchronized int authenticate(Request request) { - int retVal = super.authenticate(request); - if(retVal == AUTHENTICATION_INVALID) { + protected boolean authenticate(Request request, Response response) { + boolean succeeded = super.authenticate(request, response); + if (!succeeded) { logger.warning("authentication failure "+request); // wait until at least LAG has passed from last failure // holding object lock the whole time, so no other checks @@ -64,8 +60,8 @@ public synchronized int authenticate(Request request) { // ignore } } - lastFailureTime = now + sleepMs; + lastFailureTime = now + sleepMs; } - return retVal; + return succeeded; } } diff --git a/engine/src/main/java/org/archive/crawler/restlet/ReportGenResource.java b/engine/src/main/java/org/archive/crawler/restlet/ReportGenResource.java index 3ccc87945..5468bf553 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/ReportGenResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/ReportGenResource.java @@ -23,12 +23,12 @@ import org.restlet.Context; import org.restlet.data.MediaType; -import org.restlet.data.Request; -import org.restlet.data.Response; -import org.restlet.resource.Representation; +import org.restlet.Request; +import org.restlet.Response; +import org.restlet.representation.Representation; +import org.restlet.representation.StringRepresentation; import org.restlet.resource.ResourceException; -import org.restlet.resource.StringRepresentation; -import org.restlet.resource.Variant; +import org.restlet.representation.Variant; /** * Restlet Resource which generates fresh reports and then redirects @@ -38,24 +38,26 @@ */ public class ReportGenResource extends JobRelatedResource { protected String reportClass; - - public ReportGenResource(Context ctx, Request req, Response res) throws ResourceException { - super(ctx, req, res); + + @Override + public void init(Context ctx, Request req, Response res) throws ResourceException { + super.init(ctx, req, res); getVariants().add(new Variant(MediaType.TEXT_PLAIN)); reportClass = (String)req.getAttributes().get("reportClass"); } - public Representation represent(Variant variant) throws ResourceException { + @Override + protected Representation get(Variant variant) throws ResourceException { // generate report if (cj == null || cj.getCrawlController() == null) { throw new ResourceException(500); } - File f = cj.getCrawlController().getStatisticsTracker().writeReportFile(reportClass); + File f = cj.getCrawlController().getStatisticsTracker().writeReportFile(reportClass); if (f==null) { throw new ResourceException(500); } // redirect - String relative = JobResource.getHrefPath(f,cj); + String relative = JobResource.getHrefPath(f, cj); if(relative!=null) { getResponse().redirectSeeOther("../"+relative+"?m="+f.lastModified()); return new StringRepresentation(""); diff --git a/engine/src/main/java/org/archive/crawler/restlet/ScriptResource.java 
b/engine/src/main/java/org/archive/crawler/restlet/ScriptResource.java index 5a8bfa6bc..58fee0f91 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/ScriptResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/ScriptResource.java @@ -41,12 +41,12 @@ import org.restlet.data.Form; import org.restlet.data.MediaType; import org.restlet.data.Reference; -import org.restlet.data.Request; -import org.restlet.data.Response; -import org.restlet.resource.Representation; +import org.restlet.Request; +import org.restlet.Response; +import org.restlet.representation.Representation; +import org.restlet.representation.WriterRepresentation; import org.restlet.resource.ResourceException; -import org.restlet.resource.Variant; -import org.restlet.resource.WriterRepresentation; +import org.restlet.representation.Variant; import freemarker.template.Configuration; import freemarker.template.ObjectWrapper; @@ -81,10 +81,10 @@ public int compare(ScriptEngineFactory sef1, ScriptEngineFactory sef2) { protected String chosenEngine = FACTORIES.isEmpty() ? "" : FACTORIES.getFirst().getNames().get(0); private Configuration _templateConfiguration; - - public ScriptResource(Context ctx, Request req, Response res) throws ResourceException { - super(ctx, req, res); - setModifiable(true); + + @Override + public void init(Context ctx, Request req, Response res) throws ResourceException { + super.init(ctx, req, res); getVariants().add(new Variant(MediaType.TEXT_HTML)); getVariants().add(new Variant(MediaType.APPLICATION_XML)); @@ -92,7 +92,7 @@ public ScriptResource(Context ctx, Request req, Response res) throws ResourceExc tmpltCfg.setClassForTemplateLoading(this.getClass(),""); tmpltCfg.setObjectWrapper(ObjectWrapper.BEANS_WRAPPER); setTemplateConfiguration(tmpltCfg); - + scriptingConsole = new ScriptingConsole(cj); } public void setTemplateConfiguration(Configuration tmpltCfg) { @@ -105,8 +105,8 @@ public Configuration getTemplateConfiguration(){ private ScriptingConsole scriptingConsole; @Override - public void acceptRepresentation(Representation entity) throws ResourceException { - Form form = getRequest().getEntityAsForm(); + public Representation post(Representation entity, Variant variant) throws ResourceException { + Form form = new Form(entity); chosenEngine = form.getFirstValue("engine"); String script = form.getFirstValue("script"); if(StringUtils.isBlank(script)) { @@ -121,10 +121,11 @@ public void acceptRepresentation(Representation entity) throws ResourceException //TODO: log script, results somewhere; job log INFO? 
- getResponse().setEntity(represent()); + return get(variant); } - - public Representation represent(Variant variant) throws ResourceException { + + @Override + public Representation get(Variant variant) throws ResourceException { Representation representation; if (variant.getMediaType() == MediaType.APPLICATION_XML) { representation = new WriterRepresentation(MediaType.APPLICATION_XML) { diff --git a/engine/src/main/java/org/archive/crawler/restlet/XmlMarshaller.java b/engine/src/main/java/org/archive/crawler/restlet/XmlMarshaller.java index 11098b382..176204c67 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/XmlMarshaller.java +++ b/engine/src/main/java/org/archive/crawler/restlet/XmlMarshaller.java @@ -36,7 +36,7 @@ import javax.xml.bind.annotation.XmlType; import org.apache.commons.lang.StringUtils; -import org.restlet.util.XmlWriter; +import org.restlet.ext.xml.XmlWriter; import org.xml.sax.SAXException; /** diff --git a/engine/src/test/java/org/archive/crawler/selftest/CheckpointSelfTest.java b/engine/src/test/java/org/archive/crawler/selftest/CheckpointSelfTest.java index 138f47a10..7f7d9e6be 100644 --- a/engine/src/test/java/org/archive/crawler/selftest/CheckpointSelfTest.java +++ b/engine/src/test/java/org/archive/crawler/selftest/CheckpointSelfTest.java @@ -22,10 +22,10 @@ import java.io.IOException; import org.archive.crawler.framework.CrawlJob; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.servlet.ServletHandler; -import org.mortbay.jetty.servlet.ServletHolder; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.servlet.ServletHandler; +import org.eclipse.jetty.servlet.ServletHolder; /** @@ -85,7 +85,7 @@ protected void startHttpServer() throws Exception { private Server makeHttpServer(int port) throws Exception { Server server = new Server(); - SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost(HOST); sc.setPort(port); server.addConnector(sc); diff --git a/engine/src/test/java/org/archive/crawler/selftest/FormAuthSelfTest.java b/engine/src/test/java/org/archive/crawler/selftest/FormAuthSelfTest.java index 63d4b764b..6e04f56a9 100644 --- a/engine/src/test/java/org/archive/crawler/selftest/FormAuthSelfTest.java +++ b/engine/src/test/java/org/archive/crawler/selftest/FormAuthSelfTest.java @@ -19,19 +19,20 @@ package org.archive.crawler.selftest; +import org.eclipse.jetty.server.Handler; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.server.handler.DefaultHandler; +import org.eclipse.jetty.server.handler.HandlerList; +import org.eclipse.jetty.server.handler.ResourceHandler; +import org.eclipse.jetty.servlet.ServletHandler; +import org.eclipse.jetty.servlet.ServletHolder; + import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Set; -import org.mortbay.jetty.Handler; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.DefaultHandler; -import org.mortbay.jetty.handler.HandlerList; -import org.mortbay.jetty.handler.ResourceHandler; -import org.mortbay.jetty.servlet.ServletHandler; -import org.mortbay.jetty.servlet.ServletHolder; /** * Test form-based authentication @@ -60,17 +61,17 @@ protected void verify() throws Exception { protected void startHttpServer() throws Exception { Server server = new Server(); - 
SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7777); server.addConnector(sc); ResourceHandler rhandler = new ResourceHandler(); rhandler.setResourceBase(getSrcHtdocs().getAbsolutePath()); - ServletHandler servletHandler = new ServletHandler(); + ServletHandler servletHandler = new ServletHandler(); HandlerList handlers = new HandlerList(); - handlers.setHandlers(new Handler[] { + handlers.setHandlers(new Handler[] { rhandler, servletHandler, new DefaultHandler() }); diff --git a/engine/src/test/java/org/archive/crawler/selftest/FormLoginSelfTest.java b/engine/src/test/java/org/archive/crawler/selftest/FormLoginSelfTest.java index 510fb8395..332f6835d 100644 --- a/engine/src/test/java/org/archive/crawler/selftest/FormLoginSelfTest.java +++ b/engine/src/test/java/org/archive/crawler/selftest/FormLoginSelfTest.java @@ -19,20 +19,20 @@ package org.archive.crawler.selftest; +import org.eclipse.jetty.server.Handler; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.server.handler.DefaultHandler; +import org.eclipse.jetty.server.handler.HandlerList; +import org.eclipse.jetty.server.handler.ResourceHandler; +import org.eclipse.jetty.servlet.ServletHandler; +import org.eclipse.jetty.servlet.ServletHolder; + import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Set; -import org.mortbay.jetty.Handler; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.DefaultHandler; -import org.mortbay.jetty.handler.HandlerList; -import org.mortbay.jetty.handler.ResourceHandler; -import org.mortbay.jetty.servlet.ServletHandler; -import org.mortbay.jetty.servlet.ServletHolder; - /** * Test form-based authentication * @@ -60,17 +60,17 @@ protected void verify() throws Exception { protected void startHttpServer() throws Exception { Server server = new Server(); - SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7777); server.addConnector(sc); ResourceHandler rhandler = new ResourceHandler(); rhandler.setResourceBase(getSrcHtdocs().getAbsolutePath()); - ServletHandler servletHandler = new ServletHandler(); + ServletHandler servletHandler = new ServletHandler(); HandlerList handlers = new HandlerList(); - handlers.setHandlers(new Handler[] { + handlers.setHandlers(new Handler[] { rhandler, servletHandler, new DefaultHandler() }); diff --git a/engine/src/test/java/org/archive/crawler/selftest/HttpAuthSelfTest.java b/engine/src/test/java/org/archive/crawler/selftest/HttpAuthSelfTest.java index a0f68464f..6ad5ebd29 100644 --- a/engine/src/test/java/org/archive/crawler/selftest/HttpAuthSelfTest.java +++ b/engine/src/test/java/org/archive/crawler/selftest/HttpAuthSelfTest.java @@ -24,17 +24,16 @@ import java.util.HashSet; import java.util.Set; -import org.mortbay.jetty.Handler; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.DefaultHandler; -import org.mortbay.jetty.handler.HandlerList; -import org.mortbay.jetty.handler.ResourceHandler; -import org.mortbay.jetty.security.Constraint; -import org.mortbay.jetty.security.ConstraintMapping; -import org.mortbay.jetty.security.HashUserRealm; -import org.mortbay.jetty.security.SecurityHandler; -import org.mortbay.jetty.servlet.ServletHandler; +import org.eclipse.jetty.security.*; 
+import org.eclipse.jetty.server.Handler; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.server.handler.DefaultHandler; +import org.eclipse.jetty.server.handler.HandlerList; +import org.eclipse.jetty.server.handler.ResourceHandler; +import org.eclipse.jetty.servlet.ServletHandler; +import org.eclipse.jetty.util.security.Constraint; +import org.eclipse.jetty.util.security.Password; /** * Test HTTP basic authentication @@ -73,27 +72,27 @@ protected void startHttpServer() throws Exception { ConstraintMapping cm = new ConstraintMapping(); cm.setConstraint(constraint); cm.setPathSpec("/basic/*"); + + UserStore userStore = new UserStore(); + userStore.addUser("Mr. Happy Pants", new Password("xyzzy"), new String[]{"rule"}); + HashLoginService loginService = new HashLoginService("Hyrule"); + loginService.setUserStore(userStore); - HashUserRealm realm = new HashUserRealm(); - realm.setName("Hyrule"); - realm.put("Mr. Happy Pants", "xyzzy"); - realm.addUserToRole("Mr. Happy Pants", "user"); - - SecurityHandler securityHandler = new SecurityHandler(); - securityHandler.setUserRealm(realm); + ConstraintSecurityHandler securityHandler = new ConstraintSecurityHandler(); + securityHandler.setLoginService(loginService); securityHandler.setConstraintMappings(new ConstraintMapping[]{cm}); - SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7777); server.addConnector(sc); ResourceHandler rhandler = new ResourceHandler(); rhandler.setResourceBase(getSrcHtdocs().getAbsolutePath()); - ServletHandler servletHandler = new ServletHandler(); + ServletHandler servletHandler = new ServletHandler(); HandlerList handlers = new HandlerList(); - handlers.setHandlers(new Handler[] { + handlers.setHandlers(new Handler[] { securityHandler, rhandler, servletHandler, diff --git a/engine/src/test/java/org/archive/crawler/selftest/SelfTestBase.java b/engine/src/test/java/org/archive/crawler/selftest/SelfTestBase.java index 0771360c0..7f09dfbe6 100644 --- a/engine/src/test/java/org/archive/crawler/selftest/SelfTestBase.java +++ b/engine/src/test/java/org/archive/crawler/selftest/SelfTestBase.java @@ -38,12 +38,12 @@ import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.util.TmpDirTestCase; -import org.mortbay.jetty.Handler; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.DefaultHandler; -import org.mortbay.jetty.handler.HandlerList; -import org.mortbay.jetty.handler.ResourceHandler; +import org.eclipse.jetty.server.Handler; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.server.handler.DefaultHandler; +import org.eclipse.jetty.server.handler.HandlerList; +import org.eclipse.jetty.server.handler.ResourceHandler; /** * Base class for 'self tests', integrations tests formatted as unit @@ -182,7 +182,7 @@ protected void stopHttpServer() throws Exception { protected void startHttpServer() throws Exception { Server server = new Server(); - SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7777); server.addConnector(sc); diff --git a/engine/src/test/java/org/archive/crawler/selftest/UserAgentSelfTest.java b/engine/src/test/java/org/archive/crawler/selftest/UserAgentSelfTest.java index 489253456..d70ddc0bb 100644 --- 
a/engine/src/test/java/org/archive/crawler/selftest/UserAgentSelfTest.java +++ b/engine/src/test/java/org/archive/crawler/selftest/UserAgentSelfTest.java @@ -20,14 +20,14 @@ package org.archive.crawler.selftest; import org.archive.util.ArchiveUtils; -import org.mortbay.jetty.Handler; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.DefaultHandler; -import org.mortbay.jetty.handler.HandlerList; -import org.mortbay.jetty.handler.ResourceHandler; -import org.mortbay.jetty.servlet.ServletHandler; -import org.mortbay.jetty.servlet.ServletHolder; +import org.eclipse.jetty.server.Handler; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.server.handler.DefaultHandler; +import org.eclipse.jetty.server.handler.HandlerList; +import org.eclipse.jetty.server.handler.ResourceHandler; +import org.eclipse.jetty.servlet.ServletHandler; +import org.eclipse.jetty.servlet.ServletHolder; /** * @author pjack @@ -54,17 +54,17 @@ protected void verify() throws Exception { protected void startHttpServer() throws Exception { Server server = new Server(); - SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7777); server.addConnector(sc); ResourceHandler rhandler = new ResourceHandler(); rhandler.setResourceBase(getSrcHtdocs().getAbsolutePath()); - ServletHandler servletHandler = new ServletHandler(); + ServletHandler servletHandler = new ServletHandler(); HandlerList handlers = new HandlerList(); - handlers.setHandlers(new Handler[] { + handlers.setHandlers(new Handler[] { rhandler, servletHandler, new DefaultHandler() }); diff --git a/engine/src/test/java/org/archive/modules/fetcher/FormAuthTest.java b/engine/src/test/java/org/archive/modules/fetcher/FormAuthTest.java index 7d3ce5761..0677cbc58 100644 --- a/engine/src/test/java/org/archive/modules/fetcher/FormAuthTest.java +++ b/engine/src/test/java/org/archive/modules/fetcher/FormAuthTest.java @@ -44,20 +44,19 @@ import org.archive.net.UURIFactory; import org.archive.util.Recorder; import org.archive.util.TmpDirTestCase; -import org.mortbay.jetty.NCSARequestLog; -import org.mortbay.jetty.Request; -import org.mortbay.jetty.Server; -import org.mortbay.jetty.bio.SocketConnector; -import org.mortbay.jetty.handler.HandlerCollection; -import org.mortbay.jetty.handler.RequestLogHandler; -import org.mortbay.jetty.security.Authenticator; -import org.mortbay.jetty.security.Constraint; -import org.mortbay.jetty.security.ConstraintMapping; -import org.mortbay.jetty.security.FormAuthenticator; -import org.mortbay.jetty.security.HashUserRealm; -import org.mortbay.jetty.security.SecurityHandler; -import org.mortbay.jetty.servlet.HashSessionManager; -import org.mortbay.jetty.servlet.SessionHandler; +import org.eclipse.jetty.security.*; +import org.eclipse.jetty.security.authentication.FormAuthenticator; +import org.eclipse.jetty.server.NCSARequestLog; +import org.eclipse.jetty.server.Request; +import org.eclipse.jetty.server.Server; +import org.eclipse.jetty.server.ServerConnector; +import org.eclipse.jetty.server.handler.HandlerCollection; +import org.eclipse.jetty.server.handler.RequestLogHandler; +import org.eclipse.jetty.server.session.DefaultSessionCache; +import org.eclipse.jetty.server.session.SessionCache; +import org.eclipse.jetty.server.session.SessionHandler; +import org.eclipse.jetty.util.security.Constraint; +import 
org.eclipse.jetty.util.security.Password; /* Somewhat redundant to org.archive.crawler.selftest.FormAuthSelfTest, but * the code is written, it's easier to run in eclipse, and no doubt tests @@ -211,12 +210,9 @@ protected static class FormAuthTestHandler extends SessionHandler { public FormAuthTestHandler() { super(); } - + @Override - public void handle(String target, HttpServletRequest request, - HttpServletResponse response, int dispatch) throws IOException, - ServletException { - + public void doHandle(String target, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException { if (target.endsWith("/set-cookie")) { response.addCookie(new javax.servlet.http.Cookie("test-cookie-name", "test-cookie-value")); } @@ -247,16 +243,16 @@ protected static SecurityHandler makeAuthWrapper(Authenticator authenticator, constraintMapping.setConstraint(constraint); constraintMapping.setPathSpec("/auth/*"); - SecurityHandler authWrapper = new SecurityHandler(); + UserStore userStore = new UserStore(); + userStore.addUser(login, new Password(password), new String[]{role}); + + HashLoginService loginService = new HashLoginService(realm); + loginService.setUserStore(userStore); + + ConstraintSecurityHandler authWrapper = new ConstraintSecurityHandler(); authWrapper.setAuthenticator(authenticator); - authWrapper.setConstraintMappings(new ConstraintMapping[] {constraintMapping}); - authWrapper.setUserRealm(new HashUserRealm(realm) { - { - put(login, password); - addUserToRole(login, role); - } - }); + authWrapper.setLoginService(loginService); return authWrapper; } @@ -265,7 +261,7 @@ protected void startHttpServers() throws Exception { // server for form auth Server server = new Server(); - SocketConnector sc = new SocketConnector(); + ServerConnector sc = new ServerConnector(server); sc.setHost("127.0.0.1"); sc.setPort(7779); server.addConnector(sc); @@ -277,16 +273,16 @@ protected void startHttpServers() throws Exception { requestLogHandler.setRequestLog(requestLog); handlers.addHandler(requestLogHandler); - FormAuthenticator formAuthenticatrix = new FormAuthenticator(); - formAuthenticatrix.setLoginPage("/login.html"); - + FormAuthenticator formAuthenticatrix = new FormAuthenticator("/login.html", null, false); + SecurityHandler authWrapper = makeAuthWrapper(formAuthenticatrix, FORM_AUTH_ROLE, FORM_AUTH_REALM, FORM_AUTH_LOGIN, FORM_AUTH_PASSWORD); authWrapper.setHandler(handlers); SessionHandler sessionHandler = new SessionHandler(); - sessionHandler.setSessionManager(new HashSessionManager()); + SessionCache cache = new DefaultSessionCache(sessionHandler); + sessionHandler.setSessionCache(cache); sessionHandler.setHandler(authWrapper); server.setHandler(sessionHandler); diff --git a/engine/src/test/resources/log4j.xml b/engine/src/test/resources/log4j.xml index 475b130e2..e04f34ab8 100644 --- a/engine/src/test/resources/log4j.xml +++ b/engine/src/test/resources/log4j.xml @@ -11,7 +11,7 @@ - + diff --git a/modules/src/test/resources/log4j.xml b/modules/src/test/resources/log4j.xml index 475b130e2..e04f34ab8 100644 --- a/modules/src/test/resources/log4j.xml +++ b/modules/src/test/resources/log4j.xml @@ -11,7 +11,7 @@ - + From 996d1366b096a6f72f27843ac98ef7130988c770 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 6 Aug 2019 16:41:09 +0900 Subject: [PATCH 022/123] Drop support for JDK 7 Jetty 9.4 requires JDK 8 or later. JDK 11 requires Jetty 9.4.12+. Therefore unfortunately we cannot easily support both JDK 11 and JDK 7 at the same time. 
--- .travis.yml | 7 ++----- pom.xml | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 20978aa4d..c3f0a33a6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,9 +6,6 @@ matrix: include: - jdk: oraclejdk8 dist: trusty - - jdk: openjdk7 - dist: trusty - env: PROJECTS='--projects commons,modules,engine' - jdk: openjdk8 - jdk: openjdk11 allow_failures: @@ -22,14 +19,14 @@ before_install: - "export _JAVA_OPTIONS=-Xmx1500m" - "echo _JAVA_OPTIONS=$_JAVA_OPTIONS" -install: mvn dependency:resolve -B -V $PROJECTS +install: mvn dependency:resolve -B -V cache: directories: - $HOME/.m2 script: - - travis_wait 30 mvn install $PROJECTS + - travis_wait 30 mvn install after_failure: - cat */target/surefire-reports/*.txt diff --git a/pom.xml b/pom.xml index 033cd488b..fd89cb598 100644 --- a/pom.xml +++ b/pom.xml @@ -373,8 +373,8 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html maven-compiler-plugin 3.3 - 1.7 - 1.7 + 1.8 + 1.8 UTF-8 From 4012a5f4dd392831f5057048ba5c62d985d74331 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 6 Aug 2019 16:56:10 +0900 Subject: [PATCH 023/123] Fix 'No session data store configured' test error --- .../src/test/java/org/archive/modules/fetcher/FormAuthTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/engine/src/test/java/org/archive/modules/fetcher/FormAuthTest.java b/engine/src/test/java/org/archive/modules/fetcher/FormAuthTest.java index 0677cbc58..39ca0bb62 100644 --- a/engine/src/test/java/org/archive/modules/fetcher/FormAuthTest.java +++ b/engine/src/test/java/org/archive/modules/fetcher/FormAuthTest.java @@ -53,6 +53,7 @@ import org.eclipse.jetty.server.handler.HandlerCollection; import org.eclipse.jetty.server.handler.RequestLogHandler; import org.eclipse.jetty.server.session.DefaultSessionCache; +import org.eclipse.jetty.server.session.NullSessionDataStore; import org.eclipse.jetty.server.session.SessionCache; import org.eclipse.jetty.server.session.SessionHandler; import org.eclipse.jetty.util.security.Constraint; @@ -282,6 +283,7 @@ protected void startHttpServers() throws Exception { SessionHandler sessionHandler = new SessionHandler(); SessionCache cache = new DefaultSessionCache(sessionHandler); + cache.setSessionDataStore(new NullSessionDataStore()); sessionHandler.setSessionCache(cache); sessionHandler.setHandler(authWrapper); server.setHandler(sessionHandler); From 6c2afc36b0830e6614db96d9b1c438fadbc1e914 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 6 Aug 2019 17:03:58 +0900 Subject: [PATCH 024/123] Update statistics assertions as Jetty 9 produces different headers --- .../crawler/selftest/StatisticsSelfTest.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java b/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java index 8d880bf34..ceea95170 100644 --- a/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java +++ b/engine/src/test/java/org/archive/crawler/selftest/StatisticsSelfTest.java @@ -48,12 +48,12 @@ protected void verifyWarcStats() { StatisticsTracker stats = heritrix.getEngine().getJob("selftest-job").getCrawlController().getStatisticsTracker(); assertNotNull(stats); assertEquals(13, (long) stats.getCrawledBytes().get(CrawledBytesHistotable.WARC_NOVEL_URLS)); - assertEquals(12669, (long) stats.getCrawledBytes().get(CrawledBytesHistotable.WARC_NOVEL_CONTENT_BYTES) - stats.getBytesPerHost("dns:")); + assertEquals(7501, (long) 
stats.getCrawledBytes().get(CrawledBytesHistotable.WARC_NOVEL_CONTENT_BYTES) - stats.getBytesPerHost("dns:")); assertEquals(3, (long) stats.getServerCache().getHostFor("127.0.0.1").getSubstats().get(CrawledBytesHistotable.WARC_NOVEL_URLS)); - assertEquals(2942, (long) stats.getServerCache().getHostFor("127.0.0.1").getSubstats().get(CrawledBytesHistotable.WARC_NOVEL_CONTENT_BYTES)); + assertEquals(2133, (long) stats.getServerCache().getHostFor("127.0.0.1").getSubstats().get(CrawledBytesHistotable.WARC_NOVEL_CONTENT_BYTES)); assertEquals(10, (long) stats.getServerCache().getHostFor("localhost").getSubstats().get(CrawledBytesHistotable.WARC_NOVEL_URLS)); - assertEquals(9727, (long) stats.getServerCache().getHostFor("localhost").getSubstats().get(CrawledBytesHistotable.WARC_NOVEL_CONTENT_BYTES)); + assertEquals(5368, (long) stats.getServerCache().getHostFor("localhost").getSubstats().get(CrawledBytesHistotable.WARC_NOVEL_CONTENT_BYTES)); assertEquals(0, (long) stats.getServerCache().getHostFor("dns:").getSubstats().get(CrawledBytesHistotable.WARC_NOVEL_URLS)); } @@ -66,17 +66,17 @@ protected void verifySourceStats() throws Exception { sourceStats = stats.getSourceStats("http://127.0.0.1:7777/a.html"); assertNotNull(sourceStats); assertEquals(4, sourceStats.keySet().size()); - assertEquals(2942l, (long) sourceStats.get("novel")); + assertEquals(2133l, (long) sourceStats.get("novel")); assertEquals(3l, (long) sourceStats.get("novelCount")); - assertEquals(2942l, (long) sourceStats.get("warcNovelContentBytes")); + assertEquals(2133l, (long) sourceStats.get("warcNovelContentBytes")); assertEquals(3l, (long) sourceStats.get("warcNovelUrls")); sourceStats = stats.getSourceStats("http://localhost:7777/b.html"); assertNotNull(sourceStats); assertEquals(4, sourceStats.keySet().size()); - assertEquals(9727l, (long) sourceStats.get("novel") - stats.getBytesPerHost("dns:")); + assertEquals(5368l, (long) sourceStats.get("novel") - stats.getBytesPerHost("dns:")); assertEquals(11l, (long) sourceStats.get("novelCount")); - assertEquals(9727l, (long) sourceStats.get("warcNovelContentBytes") - stats.getBytesPerHost("dns:")); + assertEquals(5368l, (long) sourceStats.get("warcNovelContentBytes") - stats.getBytesPerHost("dns:")); assertEquals(10l, (long) sourceStats.get("warcNovelUrls")); } From 97b9c3c91c9b2fdbe44d26a36627904e3e0422bf Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 6 Aug 2019 17:13:29 +0900 Subject: [PATCH 025/123] Fix restlet child component context warning --- engine/src/main/java/org/archive/crawler/Heritrix.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/src/main/java/org/archive/crawler/Heritrix.java b/engine/src/main/java/org/archive/crawler/Heritrix.java index a737f1d78..ee0fcf79c 100644 --- a/engine/src/main/java/org/archive/crawler/Heritrix.java +++ b/engine/src/main/java/org/archive/crawler/Heritrix.java @@ -351,7 +351,7 @@ public void instanceMain(String[] args) MapVerifier verifier = new MapVerifier(); verifier.getLocalSecrets().put(authLogin, authPassword.toCharArray()); - ChallengeAuthenticator guard = new RateLimitGuard(component.getContext(), + ChallengeAuthenticator guard = new RateLimitGuard(component.getContext().createChildContext(), ChallengeScheme.HTTP_DIGEST, "Authentication Required"); guard.setVerifier(verifier); guard.setNext(new EngineApplication(engine)); From 8b72be7d1da2f5099ed9b2af9d0f1eb538cdcf0a Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Thu, 8 Aug 2019 10:37:38 +0900 Subject: [PATCH 026/123] Now jdk11 support is merged 
we can disallow test failures on it --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index c3f0a33a6..e234b9187 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,8 +8,6 @@ matrix: dist: trusty - jdk: openjdk8 - jdk: openjdk11 - allow_failures: - - jdk: openjdk11 before_install: - "export JAVA_OPTS=-Xmx1500m" From 45cd46e030ad035a9acb6a3b8b6690b7025be75d Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sun, 11 Aug 2019 12:42:03 +0100 Subject: [PATCH 027/123] Try upping to BDB JE 7.5.11 --- commons/pom.xml | 2 +- pom.xml | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/commons/pom.xml b/commons/pom.xml index 3a121b022..f511770f5 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -34,7 +34,7 @@ com.sleepycat je - 4.1.6 + 7.5.11 commons-lang diff --git a/pom.xml b/pom.xml index 2cf5e5648..cd2c65b00 100644 --- a/pom.xml +++ b/pom.xml @@ -100,6 +100,15 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html builds.archive.org,maven2 http://builds.archive.org/maven2 + + oracleReleases + Oracle Released Java Packages + http://download.oracle.com/maven + + true + + + From 7dab6ec2750095bdc6439daaf09458bb37522e36 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sun, 11 Aug 2019 22:03:55 +0100 Subject: [PATCH 028/123] Avoid using Thread.interrupt as this freaks BDB-JE. --- .../modules/fetcher/CookieStoreTest.java | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java b/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java index a621a57df..7d6da6192 100644 --- a/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java +++ b/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java @@ -30,6 +30,7 @@ import java.util.Map; import java.util.Random; import java.util.UUID; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Logger; import org.apache.commons.io.FileUtils; @@ -301,12 +302,14 @@ public void testConcurrentLoadNoDomainCookieLimitBreach() throws IOException, In bdbCookieStore().clear(); basicCookieStore().clear(); final Random rand = new Random(); + + final AtomicBoolean keepRunning = new AtomicBoolean(true); Runnable runnable = new Runnable() { @Override public void run() { try { - while (!Thread.interrupted()) { + while (keepRunning.get()) { BasicClientCookie cookie = new BasicClientCookie(UUID.randomUUID().toString(), UUID.randomUUID().toString()); cookie.setDomain("d" + rand.nextInt() + ".example.com"); bdbCookieStore().addCookie(cookie); @@ -326,10 +329,9 @@ public void run() { } Thread.sleep(5000); - - for (int i = 0; i < threads.length; i++) { - threads[i].interrupt(); - } + + // Shutdown the threads: + keepRunning.set(false); for (int i = 0; i < threads.length; i++) { threads[i].join(); } @@ -343,12 +345,14 @@ public void testConcurrentLoad() throws IOException, InterruptedException { bdbCookieStore().clear(); basicCookieStore().clear(); final Random rand = new Random(); + + final AtomicBoolean keepRunning = new AtomicBoolean(true); Runnable runnable = new Runnable() { @Override public void run() { try { - while (!Thread.interrupted()) { + while (keepRunning.get()) { BasicClientCookie cookie = new BasicClientCookie(UUID.randomUUID().toString(), UUID.randomUUID().toString()); cookie.setDomain("d" + rand.nextInt(20) + ".example.com"); bdbCookieStore().addCookie(cookie); @@ -369,9 +373,8 @@ public void run() { Thread.sleep(1000); - for (int i = 
0; i < threads.length; i++) { - threads[i].interrupt(); - } + // Shutdown the threads: + keepRunning.set(false); for (int i = 0; i < threads.length; i++) { threads[i].join(); } From 92ffdede0d017472e56dac64f622efdab5d3f3e2 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Mon, 12 Aug 2019 15:03:52 +0900 Subject: [PATCH 029/123] Link to javadoc.io for more recent api docs --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c1a47653f..2dac370fa 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Heritrix [![Build Status](https://travis-ci.org/internetarchive/heritrix3.svg?branch=master)](https://travis-ci.org/internetarchive/heritrix3) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.archive/heritrix/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.archive/heritrix) -[![Javadoc](https://javadoc-badge.appspot.com/org.archive/heritrix.svg?label=javadoc)](http://builds.archive.org/javadoc/heritrix-3.2.0/) +[![Javadoc](https://javadoc-badge.appspot.com/org.archive/heritrix.svg?label=javadoc)](https://www.javadoc.io/doc/org.archive.heritrix/heritrix-engine) [![LICENSE](https://img.shields.io/badge/license-Apache-blue.svg?style=flat-square)](./LICENSE) ## Introduction @@ -21,7 +21,7 @@ load your crawl will place on seed sites and set politeness policies accordingly - [Developer Manual](http://crawler.archive.org/articles/developer_manual/index.html) - [REST API documentation](https://heritrix.readthedocs.io/en/latest/api.html) -- [JavaDoc](http://builds.archive.org/javadoc/heritrix-3.2.0/) (n.b. Javadoc currently out of date) +- JavaDoc: [engine](https://www.javadoc.io/doc/org.archive.heritrix/heritrix-engine), [modules](https://www.javadoc.io/doc/org.archive.heritrix/heritrix-modules), [commons](https://www.javadoc.io/doc/org.archive.heritrix/heritrix-commons), [contrib](https://www.javadoc.io/doc/org.archive.heritrix/heritrix-contrib) ## Latest Releases From 36003b8626273fda6a643d719f2d973ef0393655 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Mon, 12 Aug 2019 17:36:16 +0900 Subject: [PATCH 030/123] Fix digest authentication In Restlet 2 it appears we need to use DigestAuthenticator. (Previously both digest and basic auth were handled by the same Guard class.) 
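For reference, the Restlet 2.x wiring this change (together with the RateLimitGuard rewrite) arrives at looks roughly like the sketch below. The class name DigestAuthSketch, the protect() helper and its parameters are illustrative placeholders only; the Restlet calls themselves mirror the ones visible in the Heritrix.java and RateLimitGuard.java hunks that follow.

    import java.util.UUID;

    import org.restlet.Component;
    import org.restlet.Restlet;
    import org.restlet.ext.crypto.DigestAuthenticator;
    import org.restlet.security.MapVerifier;

    // Illustrative sketch: guard an application with HTTP digest auth in Restlet 2.x.
    public class DigestAuthSketch {
        static void protect(Component component, Restlet app, String login, String password) {
            // plain-text credential store; digest auth must be able to read the secrets
            MapVerifier verifier = new MapVerifier();
            verifier.getLocalSecrets().put(login, password.toCharArray());

            DigestAuthenticator guard = new DigestAuthenticator(
                    component.getContext().createChildContext(), // child context, as in PATCH 025
                    "Authentication Required",                   // realm reported to the client
                    UUID.randomUUID().toString());               // server key used to salt nonces
            guard.setWrappedVerifier(verifier); // digest auth wraps a plain-text verifier
            guard.setNext(app);                 // e.g. the EngineApplication
            component.getDefaultHost().attach(guard);
        }
    }

In the actual patch the authenticator is the RateLimitGuard subclass, which adds the login-throttling behaviour shown earlier in this series.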
--- .../src/main/java/org/archive/crawler/Heritrix.java | 6 +++--- .../org/archive/crawler/restlet/RateLimitGuard.java | 13 ++++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/engine/src/main/java/org/archive/crawler/Heritrix.java b/engine/src/main/java/org/archive/crawler/Heritrix.java index ee0fcf79c..5714b9f6c 100644 --- a/engine/src/main/java/org/archive/crawler/Heritrix.java +++ b/engine/src/main/java/org/archive/crawler/Heritrix.java @@ -351,9 +351,9 @@ public void instanceMain(String[] args) MapVerifier verifier = new MapVerifier(); verifier.getLocalSecrets().put(authLogin, authPassword.toCharArray()); - ChallengeAuthenticator guard = new RateLimitGuard(component.getContext().createChildContext(), - ChallengeScheme.HTTP_DIGEST, "Authentication Required"); - guard.setVerifier(verifier); + RateLimitGuard guard = new RateLimitGuard(component.getContext().createChildContext(), + "Authentication Required", UUID.randomUUID().toString()); + guard.setWrappedVerifier(verifier); guard.setNext(new EngineApplication(engine)); component.getDefaultHost().attach(guard); diff --git a/engine/src/main/java/org/archive/crawler/restlet/RateLimitGuard.java b/engine/src/main/java/org/archive/crawler/restlet/RateLimitGuard.java index 042fd2ea6..7d9cd703e 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/RateLimitGuard.java +++ b/engine/src/main/java/org/archive/crawler/restlet/RateLimitGuard.java @@ -18,13 +18,12 @@ */ package org.archive.crawler.restlet; -import java.util.logging.Logger; - import org.restlet.Context; import org.restlet.Request; import org.restlet.Response; -import org.restlet.data.ChallengeScheme; -import org.restlet.security.ChallengeAuthenticator; +import org.restlet.ext.crypto.DigestAuthenticator; + +import java.util.logging.Logger; /** * ChallengeAuthenticator that slows and logs failed authentication attempts, to make @@ -32,15 +31,15 @@ * * @author gojomo */ -public class RateLimitGuard extends ChallengeAuthenticator { +public class RateLimitGuard extends DigestAuthenticator { private static final int MIN_MS_BETWEEN_ATTEMPTS = 6000; private static final Logger logger = Logger.getLogger(RateLimitGuard.class.getName()); protected long lastFailureTime = 0; - public RateLimitGuard(Context context, ChallengeScheme scheme, String realm) throws IllegalArgumentException { - super(context, scheme, realm); + public RateLimitGuard(Context context, String realm, String serverKey) throws IllegalArgumentException { + super(context, realm, serverKey); } @Override From 38c44e8e8e6189bdd46e7ffeb80fa7657eb27fd5 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Mon, 12 Aug 2019 17:50:35 +0900 Subject: [PATCH 031/123] Add missing UUID import (interactive commit fail) --- engine/src/main/java/org/archive/crawler/Heritrix.java | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/engine/src/main/java/org/archive/crawler/Heritrix.java b/engine/src/main/java/org/archive/crawler/Heritrix.java index 5714b9f6c..dd9e208ef 100644 --- a/engine/src/main/java/org/archive/crawler/Heritrix.java +++ b/engine/src/main/java/org/archive/crawler/Heritrix.java @@ -33,12 +33,7 @@ import java.security.KeyStore; import java.security.MessageDigest; import java.security.cert.Certificate; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.TimeZone; +import java.util.*; import java.util.logging.LogManager; import java.util.logging.Logger; From 
a5c03d9b73b000d61b8fc4874f9a90cc884a4937 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Mon, 12 Aug 2019 18:08:43 +0900 Subject: [PATCH 032/123] Mitigate random CookieStore.testConcurrentLoad test failures The arbitrary value `25` was used but in practice it's quite possible for more than 25 writing threads to have checked the cookie count limit before adding their cookie. In practice we see Travis failing on this test quite often, every few builds in fact. I think using `threads.length` (i.e. 200) should cover the worst case possibility where every thread reads a stale count and tries to add their cookie. Fixes #274 --- .../java/org/archive/modules/fetcher/CookieStoreTest.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java b/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java index a621a57df..37632a04a 100644 --- a/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java +++ b/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java @@ -388,7 +388,9 @@ public void run() { } for (String domain: domainCounts.keySet()) { - assertTrue(domainCounts.get(domain) <= BdbCookieStore.MAX_COOKIES_FOR_DOMAIN + 25); + // the cookie store intentionally doesn't synchronize so we allow up to thread.length + // additional cookies over the limit + assertTrue(domainCounts.get(domain) <= BdbCookieStore.MAX_COOKIES_FOR_DOMAIN + threads.length); } } From 2a5bd87e5114ec4becde464a4b5a709fdaf1e625 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Fri, 16 Aug 2019 22:11:26 +0900 Subject: [PATCH 033/123] Remove testConcurrentLoad Noah wrote in #280: > Maybe we should just drop the test. The assumption when we wrote the > test was that a race condition would not be so frequent in practice. > We've seen that under the contrived conditions created by the test > case, it is frequent.
But that's ok --- .../modules/fetcher/CookieStoreTest.java | 55 ------------------- 1 file changed, 55 deletions(-) diff --git a/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java b/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java index 37632a04a..35ff08d0c 100644 --- a/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java +++ b/modules/src/test/java/org/archive/modules/fetcher/CookieStoreTest.java @@ -338,62 +338,7 @@ public void run() { assertTrue(bdbCookieList.size() > 3000); assertCookieListsEquivalent(bdbCookieList, basicCookieStore().getCookies()); } - - public void testConcurrentLoad() throws IOException, InterruptedException { - bdbCookieStore().clear(); - basicCookieStore().clear(); - final Random rand = new Random(); - - Runnable runnable = new Runnable() { - @Override - public void run() { - try { - while (!Thread.interrupted()) { - BasicClientCookie cookie = new BasicClientCookie(UUID.randomUUID().toString(), UUID.randomUUID().toString()); - cookie.setDomain("d" + rand.nextInt(20) + ".example.com"); - bdbCookieStore().addCookie(cookie); - basicCookieStore().addCookie(cookie); - } - } catch (Exception e) { - throw new RuntimeException(e); - } - } - }; - - Thread[] threads = new Thread[200]; - for (int i = 0; i < threads.length; i++) { - threads[i] = new Thread(runnable); - threads[i].setName("cookie-load-test-" + i); - threads[i].start(); - } - - Thread.sleep(1000); - - for (int i = 0; i < threads.length; i++) { - threads[i].interrupt(); - } - for (int i = 0; i < threads.length; i++) { - threads[i].join(); - } - ArrayList bdbCookieArrayList = new ArrayList(bdbCookieStore().getCookies()); - Map domainCounts = new HashMap(); - for (Cookie cookie : bdbCookieArrayList) { - if (domainCounts.get(cookie.getDomain()) == null) { - domainCounts.put(cookie.getDomain(), 1); - } - else { - domainCounts.put(cookie.getDomain(), domainCounts.get(cookie.getDomain()) + 1); - } - } - - for (String domain: domainCounts.keySet()) { - // the cookie store intentionally doesn't synchronize so we allow up to thread.length - // additional cookies over the limit - assertTrue(domainCounts.get(domain) <= BdbCookieStore.MAX_COOKIES_FOR_DOMAIN + threads.length); - } - } - protected void assertCookieStoreCountEquals(BdbCookieStore bdb, int count) { assertEquals(bdb.getCookies().size(), count); } From 5612fa249b0d4263c948872594acfeccd792cf9d Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 27 Aug 2019 23:53:34 +0900 Subject: [PATCH 034/123] Use super.getVariants() rather than super.getVariants(GET) This was a regression introduced in the upgrade to Restlet 2. I encountered a NullPointerException here when upgrading and misunderstood the cause of it. Since PUT and DELETE return no content they are actually supposed to return null. 
--- .../crawler/restlet/EnhDirectoryResource.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java b/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java index 18c593543..05a0bc1b7 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java @@ -50,7 +50,11 @@ public class EnhDirectoryResource extends DirectoryServerResource { */ @Override public List getVariants() { - List variants = new LinkedList<>(super.getVariants(Method.GET)); + List superVariants = super.getVariants(); + if (superVariants == null) { + return null; // PUT and DELETE return no content + } + List variants = new LinkedList<>(superVariants); Form f = getRequest().getResourceRef().getQueryAsForm(); String format = f.getFirstValue("format"); if("textedit".equals(format)) { @@ -64,7 +68,11 @@ public List getVariants() { } catch (Exception e) { throw new RuntimeException(e); } - variants = new LinkedList<>(super.getVariants(Method.GET)); + superVariants = super.getVariants(); + if (superVariants == null) { + return null; + } + variants = new LinkedList<>(superVariants); } // wrap FileRepresentations in EditRepresentations ListIterator iter = variants.listIterator(); From eeddfd763f8a0199b01fa0119d24c335de128183 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Wed, 28 Aug 2019 12:46:29 +0900 Subject: [PATCH 035/123] Override PUT so it doesn't change the file extension Fixes #282 and HER-1907 --- .../crawler/restlet/EnhDirectoryResource.java | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java b/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java index 05a0bc1b7..ffebd7e65 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java +++ b/engine/src/main/java/org/archive/crawler/restlet/EnhDirectoryResource.java @@ -21,6 +21,8 @@ package org.archive.crawler.restlet; import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; import java.net.URI; import java.util.LinkedList; @@ -153,4 +155,28 @@ protected Representation post(Representation entity, Variant variant) throws Res getResponse().redirectSeeOther(ref); return new EmptyRepresentation(); } + + /* + * XXX: We override Restlet's default PUT behaviour (see FileClientHelper.handleFilePut) as it unhelpfully changes + * the file extension based on the content-type and there's no apparent way to disable that. 
+ */ + @Override + public Representation put(Representation entity) throws ResourceException { + File file = new File(URI.create(getTargetUri())); + if (getTargetUri().endsWith("/") || file.isDirectory()) { + return super.put(entity); + } + boolean created = !file.exists(); + try (FileOutputStream out = new FileOutputStream(file)) { + entity.write(out); + } catch (FileNotFoundException e) { + throw new ResourceException(Status.CLIENT_ERROR_NOT_FOUND, e); + } catch (IOException e) { + throw new ResourceException(Status.SERVER_ERROR_INTERNAL, e); + } + if (created) { + getResponse().setStatus(Status.SUCCESS_CREATED); + } + return new EmptyRepresentation(); + } } From 0cee24922fea80acd0c7ef357b32168e35f2c195 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Fri, 11 Oct 2019 19:44:35 +0000 Subject: [PATCH 036/123] AssignmentLevelSurtQueueAssignmentPolicy.java - Add support for forced queue assignment and parallel queues URIAuthorityBasedQueueAssignmentPolicy.java - Add interoperability between forced queue assignment and parallel queues QuotaEnforcer.java - Fix javadoc to match default behavior --- ...ignmentLevelSurtQueueAssignmentPolicy.java | 33 +++++++++++++++++-- ...RIAuthorityBasedQueueAssignmentPolicy.java | 8 ++--- .../crawler/prefetch/QuotaEnforcer.java | 2 +- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java b/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java index d187bfc68..88603e9c2 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java +++ b/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java @@ -18,7 +18,9 @@ */ package org.archive.crawler.frontier; +import org.apache.commons.lang.StringUtils; import org.archive.modules.CrawlURI; +import org.archive.net.UURI; import org.archive.net.PublicSuffixes; /** @@ -32,9 +34,34 @@ public class AssignmentLevelSurtQueueAssignmentPolicy extends private static final long serialVersionUID = -1533545293624791702L; @Override - public String getClassKey(CrawlURI cauri) { - String candidate = super.getClassKey(cauri); - candidate = PublicSuffixes.reduceSurtToAssignmentLevel(candidate); + public String getClassKey(CrawlURI curi) { + if(getDeferToPrevious() && !StringUtils.isEmpty(curi.getClassKey())) { + return curi.getClassKey(); + } + + UURI basis = curi.getPolicyBasisUURI(); + String candidate = super.getClassKey(curi); + candidate = PublicSuffixes.reduceSurtToAssignmentLevel(candidate); + + if(!StringUtils.isEmpty(getForceQueueAssignment())) { + candidate = getForceQueueAssignment(); + } + + // all whois urls in the same queue + if (curi.getUURI().getScheme().equals("whois")) { + return "whois..."; + } + + if(StringUtils.isEmpty(candidate)) { + return DEFAULT_CLASS_KEY; + } + if(getParallelQueues()>1) { + int subqueue = getSubqueue(basis,getParallelQueues()); + if (subqueue>0) { + candidate += "+"+subqueue; + } + } + return candidate; } diff --git a/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java b/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java index 8001540db..4fcec33e9 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java +++ b/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java @@ -90,9 +90,12 @@ public String 
getClassKey(CrawlURI curi) { if(getDeferToPrevious() && !StringUtils.isEmpty(curi.getClassKey())) { return curi.getClassKey(); } + + UURI basis = curi.getPolicyBasisUURI(); + String candidate = getCoreKey(basis); if(!StringUtils.isEmpty(getForceQueueAssignment())) { - return getForceQueueAssignment(); + candidate = getForceQueueAssignment(); } // all whois urls in the same queue @@ -100,9 +103,6 @@ public String getClassKey(CrawlURI curi) { return "whois..."; } - UURI basis = curi.getPolicyBasisUURI(); - String candidate = getCoreKey(basis); - if(StringUtils.isEmpty(candidate)) { return DEFAULT_CLASS_KEY; } diff --git a/engine/src/main/java/org/archive/crawler/prefetch/QuotaEnforcer.java b/engine/src/main/java/org/archive/crawler/prefetch/QuotaEnforcer.java index d76302271..171bb5d0a 100644 --- a/engine/src/main/java/org/archive/crawler/prefetch/QuotaEnforcer.java +++ b/engine/src/main/java/org/archive/crawler/prefetch/QuotaEnforcer.java @@ -348,7 +348,7 @@ public void setGroupMaxNovelUrls(long max) { * being force-retired (if the Frontier supports this). Note that if your * queues combine URIs that are different with regard to the quota category, * the retirement may hold back URIs not in the same quota category. Default - * is false. + * is true. */ { setForceRetire(true); From 74c0739bb00bf084a64a44aefea0bd6bd452098e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 11 Oct 2019 13:49:49 -0700 Subject: [PATCH 037/123] fix line ending and indentation issues --- .../AssignmentLevelSurtQueueAssignmentPolicy.java | 2 +- .../frontier/URIAuthorityBasedQueueAssignmentPolicy.java | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java b/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java index 88603e9c2..758751845 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java +++ b/engine/src/main/java/org/archive/crawler/frontier/AssignmentLevelSurtQueueAssignmentPolicy.java @@ -41,7 +41,7 @@ public String getClassKey(CrawlURI curi) { UURI basis = curi.getPolicyBasisUURI(); String candidate = super.getClassKey(curi); - candidate = PublicSuffixes.reduceSurtToAssignmentLevel(candidate); + candidate = PublicSuffixes.reduceSurtToAssignmentLevel(candidate); if(!StringUtils.isEmpty(getForceQueueAssignment())) { candidate = getForceQueueAssignment(); diff --git a/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java b/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java index 4fcec33e9..3212c4f07 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java +++ b/engine/src/main/java/org/archive/crawler/frontier/URIAuthorityBasedQueueAssignmentPolicy.java @@ -90,12 +90,12 @@ public String getClassKey(CrawlURI curi) { if(getDeferToPrevious() && !StringUtils.isEmpty(curi.getClassKey())) { return curi.getClassKey(); } - - UURI basis = curi.getPolicyBasisUURI(); - String candidate = getCoreKey(basis); + + UURI basis = curi.getPolicyBasisUURI(); + String candidate = getCoreKey(basis); if(!StringUtils.isEmpty(getForceQueueAssignment())) { - candidate = getForceQueueAssignment(); + candidate = getForceQueueAssignment(); } // all whois urls in the same queue From 1b95453748f87024895dc4f4314074f5bde3b54d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 Oct 2019 10:50:26 
-0700 Subject: [PATCH 038/123] be consistent and null-safe with concurrentTo --- .../archive/modules/warc/DnsResponseRecordBuilder.java | 7 ++++++- .../modules/warc/FtpControlConversationRecordBuilder.java | 8 ++++++-- .../archive/modules/warc/FtpResponseRecordBuilder.java | 6 ++++-- .../archive/modules/warc/HttpRequestRecordBuilder.java | 6 ++++-- .../archive/modules/warc/HttpResponseRecordBuilder.java | 7 ++++++- .../org/archive/modules/warc/RevisitRecordBuilder.java | 7 ++++++- .../archive/modules/warc/WhoisResponseRecordBuilder.java | 7 ++++++- 7 files changed, 38 insertions(+), 10 deletions(-) diff --git a/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java index f982a6aaf..9a89eb674 100644 --- a/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/DnsResponseRecordBuilder.java @@ -1,5 +1,6 @@ package org.archive.modules.warc; +import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP; import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL; @@ -25,11 +26,15 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOExce ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setRecordId(generateRecordID()); + if (concurrentTo != null) { + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + '<' + concurrentTo.toString() + '>'); + } recordInfo.setType(WARCRecordType.response); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(curi.getContentType()); - recordInfo.setRecordId(generateRecordID()); recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); recordInfo.setEnforceLength(true); diff --git a/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java index de8c978ae..6322c5a78 100644 --- a/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java @@ -1,6 +1,7 @@ package org.archive.modules.warc; import static org.archive.format.warc.WARCConstants.FTP_CONTROL_CONVERSATION_MIMETYPE; +import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP; import static org.archive.modules.CoreAttributeConstants.A_FTP_CONTROL_CONVERSATION; @@ -31,14 +32,17 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOExce headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi)); WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setRecordId(generateRecordID()); + if (concurrentTo != null) { + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + '<' + concurrentTo.toString() + '>'); + } recordInfo.setCreate14DigitDate(timestamp); recordInfo.setUrl(curi.toString()); recordInfo.setMimetype(FTP_CONTROL_CONVERSATION_MIMETYPE); recordInfo.setExtraHeaders(headers); recordInfo.setEnforceLength(true); recordInfo.setType(WARCRecordType.metadata); - - recordInfo.setRecordId(generateRecordID()); byte[] b = controlConversation.getBytes("UTF-8"); diff --git 
a/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java index ddd9444be..80b63e72b 100644 --- a/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java @@ -28,8 +28,10 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOExce WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setRecordId(generateRecordID()); - recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, - '<' + concurrentTo.toString() + '>'); + if (concurrentTo != null) { + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + '<' + concurrentTo.toString() + '>'); + } recordInfo.setType(WARCRecordType.response); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); diff --git a/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java index cd51e21de..eb9d95f06 100644 --- a/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/HttpRequestRecordBuilder.java @@ -27,8 +27,10 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) WARCRecordInfo recordInfo = new WARCRecordInfo(); recordInfo.setRecordId(generateRecordID()); - recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, - "<" + concurrentTo + ">"); + if (concurrentTo != null) { + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + '<' + concurrentTo.toString() + '>'); + } recordInfo.setType(WARCRecordType.request); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); diff --git a/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java index 4b54ea26e..2034d14e6 100644 --- a/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/HttpResponseRecordBuilder.java @@ -1,5 +1,6 @@ package org.archive.modules.warc; +import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; import static org.archive.format.warc.WARCConstants.HEADER_KEY_PAYLOAD_DIGEST; import static org.archive.format.warc.WARCConstants.HEADER_KEY_TRUNCATED; import static org.archive.format.warc.WARCConstants.HTTP_RESPONSE_MIMETYPE; @@ -36,8 +37,12 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOExce ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); WARCRecordInfo recordInfo = new WARCRecordInfo(); - recordInfo.setType(WARCRecordType.response); recordInfo.setRecordId(generateRecordID()); + if (concurrentTo != null) { + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + '<' + concurrentTo.toString() + '>'); + } + recordInfo.setType(WARCRecordType.response); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(HTTP_RESPONSE_MIMETYPE); diff --git a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java index 6234dd5f8..f2382562c 100644 --- a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java @@ -1,5 +1,6 @@ package org.archive.modules.warc; +import static 
org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; import static org.archive.format.warc.WARCConstants.HEADER_KEY_PROFILE; import static org.archive.format.warc.WARCConstants.HEADER_KEY_TRUNCATED; import static org.archive.format.warc.WARCConstants.HTTP_RESPONSE_MIMETYPE; @@ -42,8 +43,12 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOExce } WARCRecordInfo recordInfo = new WARCRecordInfo(); - recordInfo.setType(WARCRecordType.revisit); recordInfo.setRecordId(generateRecordID()); + if (concurrentTo != null) { + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + '<' + concurrentTo.toString() + '>'); + } + recordInfo.setType(WARCRecordType.revisit); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); String scheme = curi.getUURI().getScheme().toLowerCase(); diff --git a/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java index 3726f500b..11fa07260 100644 --- a/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/WhoisResponseRecordBuilder.java @@ -1,5 +1,6 @@ package org.archive.modules.warc; +import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; import static org.archive.format.warc.WARCConstants.HEADER_KEY_IP; import java.io.IOException; @@ -25,11 +26,15 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOExce ArchiveUtils.getLog14Date(curi.getFetchBeginTime()); WARCRecordInfo recordInfo = new WARCRecordInfo(); + recordInfo.setRecordId(generateRecordID()); + if (concurrentTo != null) { + recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO, + '<' + concurrentTo.toString() + '>'); + } recordInfo.setType(WARCRecordType.response); recordInfo.setUrl(curi.toString()); recordInfo.setCreate14DigitDate(timestamp); recordInfo.setMimetype(curi.getContentType()); - recordInfo.setRecordId(generateRecordID()); recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); recordInfo.setEnforceLength(true); From c611c46c8a931edc295e8ceae4deed846cc8cd58 Mon Sep 17 00:00:00 2001 From: Nicholas Clarke Date: Tue, 21 Nov 2017 13:44:54 +0100 Subject: [PATCH 039/123] Merged frontier-management with upgrade to bdb 7 --- .../archive/crawler/framework/CrawlJob.java | 53 +++++++++++++++++++ .../archive/crawler/framework/Frontier.java | 14 +++++ .../archive/crawler/frontier/BdbFrontier.java | 27 ++++++++++ .../frontier/BdbMultipleWorkQueues.java | 27 ++++++++++ .../crawler/prefetch/QuotaEnforcerTest.java | 19 +++++++ 5 files changed, 140 insertions(+) diff --git a/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java b/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java index c9aa8bb85..02bdbe2eb 100644 --- a/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java +++ b/engine/src/main/java/org/archive/crawler/framework/CrawlJob.java @@ -20,18 +20,25 @@ package org.archive.crawler.framework; import java.io.BufferedReader; +import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; +import java.io.OutputStreamWriter; import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; 
+import java.util.Set; import java.util.TreeMap; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.Semaphore; import java.util.logging.FileHandler; import java.util.logging.Formatter; import java.util.logging.Handler; @@ -51,6 +58,7 @@ import org.apache.commons.lang.StringUtils; import org.archive.crawler.event.CrawlStateEvent; import org.archive.crawler.framework.CrawlController.StopCompleteEvent; +import org.archive.crawler.frontier.WorkQueue; import org.archive.crawler.reporting.AlertThreadGroup; import org.archive.crawler.reporting.CrawlStatSnapshot; import org.archive.crawler.reporting.StatisticsTracker; @@ -58,6 +66,7 @@ import org.archive.spring.ConfigPathConfigurer; import org.archive.spring.PathSharingContext; import org.archive.util.ArchiveUtils; +import org.archive.util.ObjectIdentityCache; import org.archive.util.TextUtils; import org.joda.time.DateTime; import org.springframework.beans.BeanWrapperImpl; @@ -970,4 +979,48 @@ public String getJobStatusDescription() { return "Finished: "+getCrawlController().getCrawlExitStatus(); } } + + protected Semaphore exportLock = new Semaphore(1); + + public long exportPendingUris() { + CrawlController cc = getCrawlController(); + if (cc==null) { + return -1L; + } + if (!cc.isPaused()) { + cc.requestCrawlPause(); + return -2L; + } + Frontier f = cc.getFrontier(); + if (f == null) { + return -3L; + } + long pendingUrisCount = 0L; + boolean bLocked = exportLock.tryAcquire(); + if (bLocked) { + try { + File outFile = new File(getJobDir(), "pendingUris.txt"); + if (outFile.exists()) { + outFile.delete(); + } + FileOutputStream out = new FileOutputStream(outFile); + OutputStreamWriter outStreamWriter = new OutputStreamWriter(out, StandardCharsets.UTF_8); + PrintWriter writer = new PrintWriter(new BufferedWriter(outStreamWriter, 65536)); + pendingUrisCount = f.exportPendingUris(writer); + writer.close(); + outStreamWriter.close(); + out.close(); + } + catch (IOException e) { + LOGGER.log(Level.SEVERE, e.getMessage(), e); + } + finally { + exportLock.release(); + } + } + else { + return -4L; + } + return pendingUrisCount; + } }//EOC diff --git a/engine/src/main/java/org/archive/crawler/framework/Frontier.java b/engine/src/main/java/org/archive/crawler/framework/Frontier.java index 69ac60d3f..f45207283 100644 --- a/engine/src/main/java/org/archive/crawler/framework/Frontier.java +++ b/engine/src/main/java/org/archive/crawler/framework/Frontier.java @@ -20,15 +20,20 @@ import java.io.File; import java.io.IOException; +import java.io.PrintWriter; +import java.util.Set; +import java.util.concurrent.BlockingQueue; import javax.management.openmbean.CompositeData; import org.archive.crawler.frontier.FrontierJournal; import org.archive.crawler.reporting.StatisticsTracker; +import org.archive.crawler.frontier.WorkQueue; import org.archive.modules.CrawlURI; import org.archive.modules.deciderules.DecideRule; import org.archive.modules.fetcher.FetchStats; import org.archive.util.IdentityCacheable; +import org.archive.util.ObjectIdentityCache; import org.archive.util.Reporter; import org.json.JSONException; import org.springframework.context.Lifecycle; @@ -522,4 +527,13 @@ public enum State { * conditions need to be free to call this 'just in case'. 
*/ public void endDisposition(); + + public long exportPendingUris(PrintWriter writer); + + public ObjectIdentityCache getAllQueues(); + + public BlockingQueue getReadyClassQueues(); + + public Set getInProcessQueues(); + } diff --git a/engine/src/main/java/org/archive/crawler/frontier/BdbFrontier.java b/engine/src/main/java/org/archive/crawler/frontier/BdbFrontier.java index 5806df75a..d80ffd162 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/BdbFrontier.java +++ b/engine/src/main/java/org/archive/crawler/frontier/BdbFrontier.java @@ -23,7 +23,9 @@ import java.io.PrintWriter; import java.util.Map.Entry; import java.util.Queue; +import java.util.Set; import java.util.SortedMap; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.DelayQueue; import java.util.concurrent.LinkedBlockingQueue; @@ -42,6 +44,7 @@ import org.archive.checkpointing.Checkpointable; import org.archive.modules.CrawlURI; import org.archive.util.ArchiveUtils; +import org.archive.util.ObjectIdentityCache; import org.archive.util.Supplier; import org.json.JSONArray; import org.json.JSONException; @@ -468,4 +471,28 @@ protected void consistencyMarkup( queueSummaries.put(key, val); } } + + @Override + public long exportPendingUris(PrintWriter writer) { + if (pendingUris == null) { + return -5L; + } + return pendingUris.exportPendingUris(writer); + } + + @Override + public ObjectIdentityCache getAllQueues() { + return allQueues; + } + + @Override + public BlockingQueue getReadyClassQueues() { + return readyClassQueues; + } + + @Override + public Set getInProcessQueues() { + return inProcessQueues; + } + } diff --git a/engine/src/main/java/org/archive/crawler/frontier/BdbMultipleWorkQueues.java b/engine/src/main/java/org/archive/crawler/frontier/BdbMultipleWorkQueues.java index 1f480c71b..a45712dc3 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/BdbMultipleWorkQueues.java +++ b/engine/src/main/java/org/archive/crawler/frontier/BdbMultipleWorkQueues.java @@ -19,6 +19,7 @@ package org.archive.crawler.frontier; import java.io.IOException; +import java.io.PrintWriter; import java.io.UnsupportedEncodingException; import java.math.BigInteger; import java.util.ArrayList; @@ -558,4 +559,30 @@ protected void forAllPendingDo(Closure c) throws DatabaseException { } cursor.close(); } + + /** + * Run through all uris in the pending uris database and write them to the writer. 
+ * @param writer destination writer for writting all the uris + * @return number of uris written to the writer + */ + public long exportPendingUris(PrintWriter writer) { + if (this.pendingUrisDB == null) { + return -6L; + } + sync(); + DatabaseEntry key = new DatabaseEntry(); + DatabaseEntry value = new DatabaseEntry(); + long uris = 0L; + Cursor cursor = pendingUrisDB.openCursor(null, null); + while (cursor.getNext(key, value, null) == OperationStatus.SUCCESS) { + if (value.getData().length == 0) { + continue; + } + CrawlURI item = (CrawlURI) crawlUriBinding.entryToObject(value); + writer.println(item.toString()); + ++uris; + } + cursor.close(); + return uris; + } } diff --git a/engine/src/test/java/org/archive/crawler/prefetch/QuotaEnforcerTest.java b/engine/src/test/java/org/archive/crawler/prefetch/QuotaEnforcerTest.java index 1db167437..8f2099683 100644 --- a/engine/src/test/java/org/archive/crawler/prefetch/QuotaEnforcerTest.java +++ b/engine/src/test/java/org/archive/crawler/prefetch/QuotaEnforcerTest.java @@ -24,6 +24,8 @@ import java.io.PrintWriter; import java.util.HashMap; import java.util.Map; +import java.util.Set; +import java.util.concurrent.BlockingQueue; import javax.management.openmbean.CompositeData; @@ -32,6 +34,7 @@ import org.archive.crawler.framework.Frontier; import org.archive.crawler.framework.Frontier.FrontierGroup; import org.archive.crawler.frontier.FrontierJournal; +import org.archive.crawler.frontier.WorkQueue; import org.archive.modules.CoreAttributeConstants; import org.archive.modules.CrawlURI; import org.archive.modules.ProcessResult; @@ -289,6 +292,22 @@ public void beginDisposition(CrawlURI curi) { @Override public void endDisposition() { } + @Override + public long exportPendingUris(PrintWriter writer) { + return 0; + } + @Override + public ObjectIdentityCache getAllQueues() { + return null; + } + @Override + public BlockingQueue getReadyClassQueues() { + return null; + } + @Override + public Set getInProcessQueues() { + return null; + } } // separate methods to make it easier to know what failed From f22a97875d4d1d7e6517df9c01e4ac096eb11896 Mon Sep 17 00:00:00 2001 From: Colin Rosenthal Date: Fri, 15 Nov 2019 14:31:40 +0100 Subject: [PATCH 040/123] Added a timeout to crawlertrap regex matching --- dist/heritrix.iml | 85 +++++++++++++++++++ .../MatchesListRegexDecideRule.java | 30 ++++++- .../MatchesListRegexDecideRuleTest.java | 34 ++++++++ 3 files changed, 147 insertions(+), 2 deletions(-) create mode 100644 dist/heritrix.iml create mode 100644 modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java diff --git a/dist/heritrix.iml b/dist/heritrix.iml new file mode 100644 index 000000000..f1b8c3863 --- /dev/null +++ b/dist/heritrix.iml @@ -0,0 +1,85 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java b/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java index 1344b30b2..2aedcc23d 100644 --- a/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java +++ b/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java @@ -20,6 +20,10 @@ import java.util.ArrayList; import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import 
java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Pattern; @@ -39,7 +43,19 @@ public class MatchesListRegexDecideRule extends PredicatedDecideRule { private static final long serialVersionUID = 3L; private static final Logger logger = - Logger.getLogger(MatchesListRegexDecideRule.class.getName()); + Logger.getLogger(MatchesListRegexDecideRule.class.getName()); + + /** + * The timeout for regular expression matching, in seconds. If set to 0 or negative then no timeout is specified and + * there is no upper limit to how long the matching may take. See the corresponding test class MatchesListRegexDecideRuleTest + * for a pathological example. + */ + { + setTimeoutPerRegexSeconds(0L); + } + public long getTimeoutPerRegexSeconds() { return (Long) kp.get("timeout");} + public void setTimeoutPerRegexSeconds(long timeoutPerRegexSeconds) { kp.put("timeout", timeoutPerRegexSeconds);} + /** * The list of regular expressions to evalute against the URI. @@ -91,7 +107,17 @@ protected boolean evaluate(CrawlURI uri) { boolean listLogicOR = getListLogicalOr(); for (Pattern p: regexes) { - boolean matches = p.matcher(str).matches(); + boolean matches = false; + if (getTimeoutPerRegexSeconds() <= 0) { + matches = p.matcher(str).matches(); + } else { + CompletableFuture matchesFuture = CompletableFuture.supplyAsync(() -> p.matcher(str).matches()); + try { + matches = matchesFuture.get(getTimeoutPerRegexSeconds(), TimeUnit.SECONDS); + } catch (Exception e) { + logger.info("Exception while matching regex '" + p + "' to url '" + str + "' so assuming no match. " + e.getClass().getName()); + } + } if (logger.isLoggable(Level.FINER)) { logger.finer("Tested '" + str + "' match with regex '" + diff --git a/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java b/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java new file mode 100644 index 000000000..e70195007 --- /dev/null +++ b/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java @@ -0,0 +1,34 @@ +package org.archive.modules.deciderules; + +import junit.framework.TestCase; +import org.apache.commons.httpclient.URIException; +import org.archive.modules.CrawlURI; +import org.archive.net.UURIFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class MatchesListRegexDecideRuleTest extends TestCase { + + /** + * Not easy to test this code in older versions of junit. Basically with the timeout set to "0", this method + * will never return. 
+ * @throws URIException + */ + public void xtestEvaluate() throws URIException { + final String regex = "http://www\\.netarkivet\\.dk/((x+x+)+)y"; + String seed = "http://www.netarkivet.dk/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + MatchesListRegexDecideRule rule = new MatchesListRegexDecideRule(); + List patternList = new ArrayList<>(); + patternList.add(Pattern.compile(regex)); + rule.setRegexList(patternList); + rule.setEnabled(true); + rule.setListLogicalOr(true); + rule.setDecision(DecideResult.REJECT); + rule.setTimeoutPerRegexSeconds(2); + final CrawlURI curi = new CrawlURI(UURIFactory.getInstance(seed)); + final DecideResult decideResult = rule.decisionFor(curi); + assertEquals("Expected NONE not " + decideResult , DecideResult.NONE, decideResult); + } +} \ No newline at end of file From 718cc5ae3258f93752d999eab35b53f0871c0810 Mon Sep 17 00:00:00 2001 From: Colin Rosenthal Date: Fri, 15 Nov 2019 14:34:03 +0100 Subject: [PATCH 041/123] Updated manually to new SNAPSHOT version --- .../MatchesListRegexDecideRule.java | 30 ++++++++++++++-- .../MatchesListRegexDecideRuleTest.java | 34 +++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java diff --git a/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java b/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java index 1344b30b2..2aedcc23d 100644 --- a/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java +++ b/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java @@ -20,6 +20,10 @@ import java.util.ArrayList; import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Pattern; @@ -39,7 +43,19 @@ public class MatchesListRegexDecideRule extends PredicatedDecideRule { private static final long serialVersionUID = 3L; private static final Logger logger = - Logger.getLogger(MatchesListRegexDecideRule.class.getName()); + Logger.getLogger(MatchesListRegexDecideRule.class.getName()); + + /** + * The timeout for regular expression matching, in seconds. If set to 0 or negative then no timeout is specified and + * there is no upper limit to how long the matching may take. See the corresponding test class MatchesListRegexDecideRuleTest + * for a pathological example. + */ + { + setTimeoutPerRegexSeconds(0L); + } + public long getTimeoutPerRegexSeconds() { return (Long) kp.get("timeout");} + public void setTimeoutPerRegexSeconds(long timeoutPerRegexSeconds) { kp.put("timeout", timeoutPerRegexSeconds);} + /** * The list of regular expressions to evalute against the URI. @@ -91,7 +107,17 @@ protected boolean evaluate(CrawlURI uri) { boolean listLogicOR = getListLogicalOr(); for (Pattern p: regexes) { - boolean matches = p.matcher(str).matches(); + boolean matches = false; + if (getTimeoutPerRegexSeconds() <= 0) { + matches = p.matcher(str).matches(); + } else { + CompletableFuture matchesFuture = CompletableFuture.supplyAsync(() -> p.matcher(str).matches()); + try { + matches = matchesFuture.get(getTimeoutPerRegexSeconds(), TimeUnit.SECONDS); + } catch (Exception e) { + logger.info("Exception while matching regex '" + p + "' to url '" + str + "' so assuming no match. 
" + e.getClass().getName()); + } + } if (logger.isLoggable(Level.FINER)) { logger.finer("Tested '" + str + "' match with regex '" + diff --git a/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java b/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java new file mode 100644 index 000000000..e70195007 --- /dev/null +++ b/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java @@ -0,0 +1,34 @@ +package org.archive.modules.deciderules; + +import junit.framework.TestCase; +import org.apache.commons.httpclient.URIException; +import org.archive.modules.CrawlURI; +import org.archive.net.UURIFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class MatchesListRegexDecideRuleTest extends TestCase { + + /** + * Not easy to test this code in older versions of junit. Basically with the timeout set to "0", this method + * will never return. + * @throws URIException + */ + public void xtestEvaluate() throws URIException { + final String regex = "http://www\\.netarkivet\\.dk/((x+x+)+)y"; + String seed = "http://www.netarkivet.dk/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + MatchesListRegexDecideRule rule = new MatchesListRegexDecideRule(); + List patternList = new ArrayList<>(); + patternList.add(Pattern.compile(regex)); + rule.setRegexList(patternList); + rule.setEnabled(true); + rule.setListLogicalOr(true); + rule.setDecision(DecideResult.REJECT); + rule.setTimeoutPerRegexSeconds(2); + final CrawlURI curi = new CrawlURI(UURIFactory.getInstance(seed)); + final DecideResult decideResult = rule.decisionFor(curi); + assertEquals("Expected NONE not " + decideResult , DecideResult.NONE, decideResult); + } +} \ No newline at end of file From bf5842c02ed3516713d792b358c7c0a7c8bc52bd Mon Sep 17 00:00:00 2001 From: Colin Rosenthal Date: Fri, 15 Nov 2019 14:37:11 +0100 Subject: [PATCH 042/123] removed .iml file --- .../Maven__com_101tec_zkclient_0_7.xml | 13 +++ ...Maven__com_google_code_gson_gson_2_2_4.xml | 13 +++ ...oglecode_json_simple_json_simple_1_1_1.xml | 13 +++ ...__com_rethinkdb_rethinkdb_driver_2_3_3.xml | 13 +++ .../Maven__com_sleepycat_je_4_1_6.xml | 13 +++ ...un_istack_istack_commons_runtime_3_0_7.xml | 13 +++ ...sun_xml_fastinfoset_FastInfoset_1_2_15.xml | 13 +++ ..._activation_javax_activation_api_1_2_0.xml | 13 +++ ..._javax_servlet_javax_servlet_api_3_1_0.xml | 13 +++ .../Maven__javax_xml_bind_jaxb_api_2_3_1.xml | 13 +++ .idea/libraries/Maven__junit_junit_4_10.xml | 13 +++ ...n__org_apache_avro_avro_1_7_6_cdh5_3_5.xml | 13 +++ ...rg_apache_curator_curator_client_2_6_0.xml | 13 +++ ...apache_curator_curator_framework_2_6_0.xml | 13 +++ ...g_apache_curator_curator_recipes_2_6_0.xml | 13 +++ ...e_directory_api_api_asn1_api_1_0_0_M20.xml | 13 +++ ...pache_directory_api_api_util_1_0_0_M20.xml | 13 +++ ...rectory_server_apacheds_i18n_2_0_0_M15.xml | 13 +++ ...rver_apacheds_kerberos_codec_2_0_0_M15.xml | 13 +++ ...doop_hadoop_annotations_2_5_0_cdh5_3_5.xml | 13 +++ ...ache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml | 13 +++ ...he_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml | 13 +++ ..._hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml | 13 +++ ...che_hbase_hbase_client_0_98_6_cdh5_3_5.xml | 13 +++ ...che_hbase_hbase_common_0_98_6_cdh5_3_5.xml | 13 +++ ...e_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml | 13 +++ ...n__org_apache_kafka_kafka_2_10_0_9_0_0.xml | 13 +++ ...org_apache_kafka_kafka_clients_0_9_0_0.xml | 13 +++ 
...che_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml | 13 +++ ...__org_cloudera_htrace_htrace_core_2_04.xml | 13 +++ ...ipse_jetty_jetty_http_9_4_19_v20190610.xml | 13 +++ ...clipse_jetty_jetty_io_9_4_19_v20190610.xml | 13 +++ ..._jetty_jetty_security_9_4_19_v20190610.xml | 13 +++ ...se_jetty_jetty_server_9_4_19_v20190610.xml | 13 +++ ...e_jetty_jetty_servlet_9_4_19_v20190610.xml | 13 +++ ...ipse_jetty_jetty_util_9_4_19_v20190610.xml | 13 +++ ..._org_glassfish_jaxb_jaxb_runtime_2_3_1.xml | 13 +++ .../Maven__org_glassfish_jaxb_txw2_2_3_1.xml | 13 +++ .../Maven__org_hamcrest_hamcrest_core_1_1.xml | 13 +++ .../Maven__org_jvnet_staxex_stax_ex_1_8.xml | 13 +++ ..._mortbay_jetty_jetty_6_1_26_cloudera_4.xml | 13 +++ ...serve_commons_webarchive_commons_1_1_8.xml | 13 +++ ...ven__org_restlet_jse_org_restlet_2_4_0.xml | 13 +++ ...stlet_jse_org_restlet_ext_crypto_2_4_0.xml | 13 +++ ...estlet_jse_org_restlet_ext_jetty_2_4_0.xml | 13 +++ ..._restlet_jse_org_restlet_ext_xml_2_4_0.xml | 13 +++ ...n__org_scala_lang_scala_library_2_10_5.xml | 13 +++ .../Maven__org_slf4j_slf4j_api_1_7_12.xml | 13 +++ .../Maven__org_slf4j_slf4j_log4j12_1_7_6.xml | 13 +++ ..._org_xerial_snappy_snappy_java_1_1_1_7.xml | 13 +++ dist/heritrix.iml | 85 ------------------- 51 files changed, 650 insertions(+), 85 deletions(-) create mode 100644 .idea/libraries/Maven__com_101tec_zkclient_0_7.xml create mode 100644 .idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml create mode 100644 .idea/libraries/Maven__com_googlecode_json_simple_json_simple_1_1_1.xml create mode 100644 .idea/libraries/Maven__com_rethinkdb_rethinkdb_driver_2_3_3.xml create mode 100644 .idea/libraries/Maven__com_sleepycat_je_4_1_6.xml create mode 100644 .idea/libraries/Maven__com_sun_istack_istack_commons_runtime_3_0_7.xml create mode 100644 .idea/libraries/Maven__com_sun_xml_fastinfoset_FastInfoset_1_2_15.xml create mode 100644 .idea/libraries/Maven__javax_activation_javax_activation_api_1_2_0.xml create mode 100644 .idea/libraries/Maven__javax_servlet_javax_servlet_api_3_1_0.xml create mode 100644 .idea/libraries/Maven__javax_xml_bind_jaxb_api_2_3_1.xml create mode 100644 .idea/libraries/Maven__junit_junit_4_10.xml create mode 100644 .idea/libraries/Maven__org_apache_avro_avro_1_7_6_cdh5_3_5.xml create mode 100644 .idea/libraries/Maven__org_apache_curator_curator_client_2_6_0.xml create mode 100644 .idea/libraries/Maven__org_apache_curator_curator_framework_2_6_0.xml create mode 100644 .idea/libraries/Maven__org_apache_curator_curator_recipes_2_6_0.xml create mode 100644 .idea/libraries/Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml create mode 100644 .idea/libraries/Maven__org_apache_directory_api_api_util_1_0_0_M20.xml create mode 100644 .idea/libraries/Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml create mode 100644 .idea/libraries/Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml create mode 100644 .idea/libraries/Maven__org_apache_hadoop_hadoop_annotations_2_5_0_cdh5_3_5.xml create mode 100644 .idea/libraries/Maven__org_apache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml create mode 100644 .idea/libraries/Maven__org_apache_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml create mode 100644 .idea/libraries/Maven__org_apache_hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml create mode 100644 .idea/libraries/Maven__org_apache_hbase_hbase_client_0_98_6_cdh5_3_5.xml create mode 100644 .idea/libraries/Maven__org_apache_hbase_hbase_common_0_98_6_cdh5_3_5.xml create mode 100644 
.idea/libraries/Maven__org_apache_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml create mode 100644 .idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_9_0_0.xml create mode 100644 .idea/libraries/Maven__org_apache_kafka_kafka_clients_0_9_0_0.xml create mode 100644 .idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml create mode 100644 .idea/libraries/Maven__org_cloudera_htrace_htrace_core_2_04.xml create mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_http_9_4_19_v20190610.xml create mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_io_9_4_19_v20190610.xml create mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_security_9_4_19_v20190610.xml create mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_server_9_4_19_v20190610.xml create mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_servlet_9_4_19_v20190610.xml create mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_util_9_4_19_v20190610.xml create mode 100644 .idea/libraries/Maven__org_glassfish_jaxb_jaxb_runtime_2_3_1.xml create mode 100644 .idea/libraries/Maven__org_glassfish_jaxb_txw2_2_3_1.xml create mode 100644 .idea/libraries/Maven__org_hamcrest_hamcrest_core_1_1.xml create mode 100644 .idea/libraries/Maven__org_jvnet_staxex_stax_ex_1_8.xml create mode 100644 .idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26_cloudera_4.xml create mode 100644 .idea/libraries/Maven__org_netpreserve_commons_webarchive_commons_1_1_8.xml create mode 100644 .idea/libraries/Maven__org_restlet_jse_org_restlet_2_4_0.xml create mode 100644 .idea/libraries/Maven__org_restlet_jse_org_restlet_ext_crypto_2_4_0.xml create mode 100644 .idea/libraries/Maven__org_restlet_jse_org_restlet_ext_jetty_2_4_0.xml create mode 100644 .idea/libraries/Maven__org_restlet_jse_org_restlet_ext_xml_2_4_0.xml create mode 100644 .idea/libraries/Maven__org_scala_lang_scala_library_2_10_5.xml create mode 100644 .idea/libraries/Maven__org_slf4j_slf4j_api_1_7_12.xml create mode 100644 .idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_6.xml create mode 100644 .idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_1_7.xml delete mode 100644 dist/heritrix.iml diff --git a/.idea/libraries/Maven__com_101tec_zkclient_0_7.xml b/.idea/libraries/Maven__com_101tec_zkclient_0_7.xml new file mode 100644 index 000000000..a23acb560 --- /dev/null +++ b/.idea/libraries/Maven__com_101tec_zkclient_0_7.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml b/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml new file mode 100644 index 000000000..c91e18862 --- /dev/null +++ b/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_googlecode_json_simple_json_simple_1_1_1.xml b/.idea/libraries/Maven__com_googlecode_json_simple_json_simple_1_1_1.xml new file mode 100644 index 000000000..a99f3a2eb --- /dev/null +++ b/.idea/libraries/Maven__com_googlecode_json_simple_json_simple_1_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_rethinkdb_rethinkdb_driver_2_3_3.xml b/.idea/libraries/Maven__com_rethinkdb_rethinkdb_driver_2_3_3.xml new file mode 100644 index 000000000..6ed629942 --- /dev/null +++ b/.idea/libraries/Maven__com_rethinkdb_rethinkdb_driver_2_3_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__com_sleepycat_je_4_1_6.xml b/.idea/libraries/Maven__com_sleepycat_je_4_1_6.xml new file mode 100644 index 000000000..22f18996a --- /dev/null +++ b/.idea/libraries/Maven__com_sleepycat_je_4_1_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_sun_istack_istack_commons_runtime_3_0_7.xml b/.idea/libraries/Maven__com_sun_istack_istack_commons_runtime_3_0_7.xml new file mode 100644 index 000000000..a9a7ec3dc --- /dev/null +++ b/.idea/libraries/Maven__com_sun_istack_istack_commons_runtime_3_0_7.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_sun_xml_fastinfoset_FastInfoset_1_2_15.xml b/.idea/libraries/Maven__com_sun_xml_fastinfoset_FastInfoset_1_2_15.xml new file mode 100644 index 000000000..ec6faea39 --- /dev/null +++ b/.idea/libraries/Maven__com_sun_xml_fastinfoset_FastInfoset_1_2_15.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_activation_javax_activation_api_1_2_0.xml b/.idea/libraries/Maven__javax_activation_javax_activation_api_1_2_0.xml new file mode 100644 index 000000000..5b2b25be0 --- /dev/null +++ b/.idea/libraries/Maven__javax_activation_javax_activation_api_1_2_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_servlet_javax_servlet_api_3_1_0.xml b/.idea/libraries/Maven__javax_servlet_javax_servlet_api_3_1_0.xml new file mode 100644 index 000000000..40d82ec8d --- /dev/null +++ b/.idea/libraries/Maven__javax_servlet_javax_servlet_api_3_1_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_xml_bind_jaxb_api_2_3_1.xml b/.idea/libraries/Maven__javax_xml_bind_jaxb_api_2_3_1.xml new file mode 100644 index 000000000..870581996 --- /dev/null +++ b/.idea/libraries/Maven__javax_xml_bind_jaxb_api_2_3_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__junit_junit_4_10.xml b/.idea/libraries/Maven__junit_junit_4_10.xml new file mode 100644 index 000000000..f126b9cb5 --- /dev/null +++ b/.idea/libraries/Maven__junit_junit_4_10.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_avro_avro_1_7_6_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_avro_avro_1_7_6_cdh5_3_5.xml new file mode 100644 index 000000000..186913dc3 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_avro_avro_1_7_6_cdh5_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_curator_curator_client_2_6_0.xml b/.idea/libraries/Maven__org_apache_curator_curator_client_2_6_0.xml new file mode 100644 index 000000000..6a284b7d8 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_curator_curator_client_2_6_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_curator_curator_framework_2_6_0.xml b/.idea/libraries/Maven__org_apache_curator_curator_framework_2_6_0.xml new file mode 100644 index 000000000..7c1622082 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_curator_curator_framework_2_6_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_curator_curator_recipes_2_6_0.xml 
b/.idea/libraries/Maven__org_apache_curator_curator_recipes_2_6_0.xml new file mode 100644 index 000000000..35631fe16 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_curator_curator_recipes_2_6_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml b/.idea/libraries/Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml new file mode 100644 index 000000000..8f89c372d --- /dev/null +++ b/.idea/libraries/Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_directory_api_api_util_1_0_0_M20.xml b/.idea/libraries/Maven__org_apache_directory_api_api_util_1_0_0_M20.xml new file mode 100644 index 000000000..52fa36de3 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_directory_api_api_util_1_0_0_M20.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml b/.idea/libraries/Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml new file mode 100644 index 000000000..29cd91bcd --- /dev/null +++ b/.idea/libraries/Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml b/.idea/libraries/Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml new file mode 100644 index 000000000..3734fcb38 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hadoop_hadoop_annotations_2_5_0_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hadoop_hadoop_annotations_2_5_0_cdh5_3_5.xml new file mode 100644 index 000000000..fd2eaea9b --- /dev/null +++ b/.idea/libraries/Maven__org_apache_hadoop_hadoop_annotations_2_5_0_cdh5_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml new file mode 100644 index 000000000..b4fcfd764 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml new file mode 100644 index 000000000..2c822b77b --- /dev/null +++ b/.idea/libraries/Maven__org_apache_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml new file mode 100644 index 000000000..fa1c09fe6 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hbase_hbase_client_0_98_6_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hbase_hbase_client_0_98_6_cdh5_3_5.xml new file mode 100644 index 
000000000..c1338c97e --- /dev/null +++ b/.idea/libraries/Maven__org_apache_hbase_hbase_client_0_98_6_cdh5_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hbase_hbase_common_0_98_6_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hbase_hbase_common_0_98_6_cdh5_3_5.xml new file mode 100644 index 000000000..d5474b74f --- /dev/null +++ b/.idea/libraries/Maven__org_apache_hbase_hbase_common_0_98_6_cdh5_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml new file mode 100644 index 000000000..a353830d3 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_9_0_0.xml b/.idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_9_0_0.xml new file mode 100644 index 000000000..9948ff7ca --- /dev/null +++ b/.idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_9_0_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_kafka_kafka_clients_0_9_0_0.xml b/.idea/libraries/Maven__org_apache_kafka_kafka_clients_0_9_0_0.xml new file mode 100644 index 000000000..4396e1b8a --- /dev/null +++ b/.idea/libraries/Maven__org_apache_kafka_kafka_clients_0_9_0_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml new file mode 100644 index 000000000..33f317a8c --- /dev/null +++ b/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_cloudera_htrace_htrace_core_2_04.xml b/.idea/libraries/Maven__org_cloudera_htrace_htrace_core_2_04.xml new file mode 100644 index 000000000..07046f2ef --- /dev/null +++ b/.idea/libraries/Maven__org_cloudera_htrace_htrace_core_2_04.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_http_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_http_9_4_19_v20190610.xml new file mode 100644 index 000000000..de7725f6d --- /dev/null +++ b/.idea/libraries/Maven__org_eclipse_jetty_jetty_http_9_4_19_v20190610.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_io_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_io_9_4_19_v20190610.xml new file mode 100644 index 000000000..2d6f875f0 --- /dev/null +++ b/.idea/libraries/Maven__org_eclipse_jetty_jetty_io_9_4_19_v20190610.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_security_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_security_9_4_19_v20190610.xml new file mode 100644 index 000000000..4e93c3865 --- /dev/null +++ b/.idea/libraries/Maven__org_eclipse_jetty_jetty_security_9_4_19_v20190610.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__org_eclipse_jetty_jetty_server_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_server_9_4_19_v20190610.xml new file mode 100644 index 000000000..284f6e11d --- /dev/null +++ b/.idea/libraries/Maven__org_eclipse_jetty_jetty_server_9_4_19_v20190610.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_servlet_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_servlet_9_4_19_v20190610.xml new file mode 100644 index 000000000..0453dd782 --- /dev/null +++ b/.idea/libraries/Maven__org_eclipse_jetty_jetty_servlet_9_4_19_v20190610.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_util_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_util_9_4_19_v20190610.xml new file mode 100644 index 000000000..9ef563f3f --- /dev/null +++ b/.idea/libraries/Maven__org_eclipse_jetty_jetty_util_9_4_19_v20190610.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_glassfish_jaxb_jaxb_runtime_2_3_1.xml b/.idea/libraries/Maven__org_glassfish_jaxb_jaxb_runtime_2_3_1.xml new file mode 100644 index 000000000..130f99fe0 --- /dev/null +++ b/.idea/libraries/Maven__org_glassfish_jaxb_jaxb_runtime_2_3_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_glassfish_jaxb_txw2_2_3_1.xml b/.idea/libraries/Maven__org_glassfish_jaxb_txw2_2_3_1.xml new file mode 100644 index 000000000..30bca6953 --- /dev/null +++ b/.idea/libraries/Maven__org_glassfish_jaxb_txw2_2_3_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_1.xml b/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_1.xml new file mode 100644 index 000000000..e4c18dc38 --- /dev/null +++ b/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jvnet_staxex_stax_ex_1_8.xml b/.idea/libraries/Maven__org_jvnet_staxex_stax_ex_1_8.xml new file mode 100644 index 000000000..9155b90ca --- /dev/null +++ b/.idea/libraries/Maven__org_jvnet_staxex_stax_ex_1_8.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26_cloudera_4.xml b/.idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26_cloudera_4.xml new file mode 100644 index 000000000..d983ad72a --- /dev/null +++ b/.idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26_cloudera_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_netpreserve_commons_webarchive_commons_1_1_8.xml b/.idea/libraries/Maven__org_netpreserve_commons_webarchive_commons_1_1_8.xml new file mode 100644 index 000000000..8b8be36c9 --- /dev/null +++ b/.idea/libraries/Maven__org_netpreserve_commons_webarchive_commons_1_1_8.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_restlet_jse_org_restlet_2_4_0.xml b/.idea/libraries/Maven__org_restlet_jse_org_restlet_2_4_0.xml new file mode 100644 index 000000000..dd9e47799 --- /dev/null +++ b/.idea/libraries/Maven__org_restlet_jse_org_restlet_2_4_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_crypto_2_4_0.xml b/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_crypto_2_4_0.xml new file mode 100644 index 000000000..ff8c21ea9 --- /dev/null +++ b/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_crypto_2_4_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_jetty_2_4_0.xml b/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_jetty_2_4_0.xml new file mode 100644 index 000000000..8c51d4463 --- /dev/null +++ b/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_jetty_2_4_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_xml_2_4_0.xml b/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_xml_2_4_0.xml new file mode 100644 index 000000000..fb34497cf --- /dev/null +++ b/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_xml_2_4_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_scala_lang_scala_library_2_10_5.xml b/.idea/libraries/Maven__org_scala_lang_scala_library_2_10_5.xml new file mode 100644 index 000000000..018d6c488 --- /dev/null +++ b/.idea/libraries/Maven__org_scala_lang_scala_library_2_10_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_12.xml b/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_12.xml new file mode 100644 index 000000000..4d0f846fb --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_12.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_6.xml b/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_6.xml new file mode 100644 index 000000000..906e5c9f4 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_1_7.xml b/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_1_7.xml new file mode 100644 index 000000000..0ed57668d --- /dev/null +++ b/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_1_7.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/dist/heritrix.iml b/dist/heritrix.iml deleted file mode 100644 index f1b8c3863..000000000 --- a/dist/heritrix.iml +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file From 5e536862db47d6d3f4d28c853e56e09c646d42bb Mon Sep 17 00:00:00 2001 From: Colin Rosenthal Date: Fri, 15 Nov 2019 14:38:45 +0100 Subject: [PATCH 043/123] Removed mistaken additions --- .idea/libraries/Maven__com_101tec_zkclient_0_7.xml | 13 ------------- .../Maven__com_google_code_gson_gson_2_2_4.xml | 13 ------------- ...com_googlecode_json_simple_json_simple_1_1_1.xml | 13 ------------- .../Maven__com_rethinkdb_rethinkdb_driver_2_3_3.xml | 13 ------------- .idea/libraries/Maven__com_sleepycat_je_4_1_6.xml | 13 ------------- ..._com_sun_istack_istack_commons_runtime_3_0_7.xml | 13 ------------- ...__com_sun_xml_fastinfoset_FastInfoset_1_2_15.xml | 13 ------------- ..._javax_activation_javax_activation_api_1_2_0.xml | 13 ------------- 
...Maven__javax_servlet_javax_servlet_api_3_1_0.xml | 13 ------------- .../Maven__javax_xml_bind_jaxb_api_2_3_1.xml | 13 ------------- .idea/libraries/Maven__junit_junit_4_10.xml | 13 ------------- .../Maven__org_apache_avro_avro_1_7_6_cdh5_3_5.xml | 13 ------------- ...ven__org_apache_curator_curator_client_2_6_0.xml | 13 ------------- ...__org_apache_curator_curator_framework_2_6_0.xml | 13 ------------- ...en__org_apache_curator_curator_recipes_2_6_0.xml | 13 ------------- ..._apache_directory_api_api_asn1_api_1_0_0_M20.xml | 13 ------------- ..._org_apache_directory_api_api_util_1_0_0_M20.xml | 13 ------------- ...che_directory_server_apacheds_i18n_2_0_0_M15.xml | 13 ------------- ...ory_server_apacheds_kerberos_codec_2_0_0_M15.xml | 13 ------------- ...che_hadoop_hadoop_annotations_2_5_0_cdh5_3_5.xml | 13 ------------- ...org_apache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml | 13 ------------- ...g_apache_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml | 13 ------------- ...apache_hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml | 13 ------------- ...rg_apache_hbase_hbase_client_0_98_6_cdh5_3_5.xml | 13 ------------- ...rg_apache_hbase_hbase_common_0_98_6_cdh5_3_5.xml | 13 ------------- ..._apache_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml | 13 ------------- .../Maven__org_apache_kafka_kafka_2_10_0_9_0_0.xml | 13 ------------- ...aven__org_apache_kafka_kafka_clients_0_9_0_0.xml | 13 ------------- ...rg_apache_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml | 13 ------------- .../Maven__org_cloudera_htrace_htrace_core_2_04.xml | 13 ------------- ...rg_eclipse_jetty_jetty_http_9_4_19_v20190610.xml | 13 ------------- ..._org_eclipse_jetty_jetty_io_9_4_19_v20190610.xml | 13 ------------- ...clipse_jetty_jetty_security_9_4_19_v20190610.xml | 13 ------------- ..._eclipse_jetty_jetty_server_9_4_19_v20190610.xml | 13 ------------- ...eclipse_jetty_jetty_servlet_9_4_19_v20190610.xml | 13 ------------- ...rg_eclipse_jetty_jetty_util_9_4_19_v20190610.xml | 13 ------------- ...Maven__org_glassfish_jaxb_jaxb_runtime_2_3_1.xml | 13 ------------- .../Maven__org_glassfish_jaxb_txw2_2_3_1.xml | 13 ------------- .../Maven__org_hamcrest_hamcrest_core_1_1.xml | 13 ------------- .../Maven__org_jvnet_staxex_stax_ex_1_8.xml | 13 ------------- ...n__org_mortbay_jetty_jetty_6_1_26_cloudera_4.xml | 13 ------------- ...netpreserve_commons_webarchive_commons_1_1_8.xml | 13 ------------- .../Maven__org_restlet_jse_org_restlet_2_4_0.xml | 13 ------------- ...org_restlet_jse_org_restlet_ext_crypto_2_4_0.xml | 13 ------------- ..._org_restlet_jse_org_restlet_ext_jetty_2_4_0.xml | 13 ------------- ...n__org_restlet_jse_org_restlet_ext_xml_2_4_0.xml | 13 ------------- .../Maven__org_scala_lang_scala_library_2_10_5.xml | 13 ------------- .../libraries/Maven__org_slf4j_slf4j_api_1_7_12.xml | 13 ------------- .../Maven__org_slf4j_slf4j_log4j12_1_7_6.xml | 13 ------------- ...Maven__org_xerial_snappy_snappy_java_1_1_1_7.xml | 13 ------------- 50 files changed, 650 deletions(-) delete mode 100644 .idea/libraries/Maven__com_101tec_zkclient_0_7.xml delete mode 100644 .idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml delete mode 100644 .idea/libraries/Maven__com_googlecode_json_simple_json_simple_1_1_1.xml delete mode 100644 .idea/libraries/Maven__com_rethinkdb_rethinkdb_driver_2_3_3.xml delete mode 100644 .idea/libraries/Maven__com_sleepycat_je_4_1_6.xml delete mode 100644 .idea/libraries/Maven__com_sun_istack_istack_commons_runtime_3_0_7.xml delete mode 100644 .idea/libraries/Maven__com_sun_xml_fastinfoset_FastInfoset_1_2_15.xml delete mode 
100644 .idea/libraries/Maven__javax_activation_javax_activation_api_1_2_0.xml delete mode 100644 .idea/libraries/Maven__javax_servlet_javax_servlet_api_3_1_0.xml delete mode 100644 .idea/libraries/Maven__javax_xml_bind_jaxb_api_2_3_1.xml delete mode 100644 .idea/libraries/Maven__junit_junit_4_10.xml delete mode 100644 .idea/libraries/Maven__org_apache_avro_avro_1_7_6_cdh5_3_5.xml delete mode 100644 .idea/libraries/Maven__org_apache_curator_curator_client_2_6_0.xml delete mode 100644 .idea/libraries/Maven__org_apache_curator_curator_framework_2_6_0.xml delete mode 100644 .idea/libraries/Maven__org_apache_curator_curator_recipes_2_6_0.xml delete mode 100644 .idea/libraries/Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml delete mode 100644 .idea/libraries/Maven__org_apache_directory_api_api_util_1_0_0_M20.xml delete mode 100644 .idea/libraries/Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml delete mode 100644 .idea/libraries/Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml delete mode 100644 .idea/libraries/Maven__org_apache_hadoop_hadoop_annotations_2_5_0_cdh5_3_5.xml delete mode 100644 .idea/libraries/Maven__org_apache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml delete mode 100644 .idea/libraries/Maven__org_apache_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml delete mode 100644 .idea/libraries/Maven__org_apache_hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml delete mode 100644 .idea/libraries/Maven__org_apache_hbase_hbase_client_0_98_6_cdh5_3_5.xml delete mode 100644 .idea/libraries/Maven__org_apache_hbase_hbase_common_0_98_6_cdh5_3_5.xml delete mode 100644 .idea/libraries/Maven__org_apache_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml delete mode 100644 .idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_9_0_0.xml delete mode 100644 .idea/libraries/Maven__org_apache_kafka_kafka_clients_0_9_0_0.xml delete mode 100644 .idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml delete mode 100644 .idea/libraries/Maven__org_cloudera_htrace_htrace_core_2_04.xml delete mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_http_9_4_19_v20190610.xml delete mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_io_9_4_19_v20190610.xml delete mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_security_9_4_19_v20190610.xml delete mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_server_9_4_19_v20190610.xml delete mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_servlet_9_4_19_v20190610.xml delete mode 100644 .idea/libraries/Maven__org_eclipse_jetty_jetty_util_9_4_19_v20190610.xml delete mode 100644 .idea/libraries/Maven__org_glassfish_jaxb_jaxb_runtime_2_3_1.xml delete mode 100644 .idea/libraries/Maven__org_glassfish_jaxb_txw2_2_3_1.xml delete mode 100644 .idea/libraries/Maven__org_hamcrest_hamcrest_core_1_1.xml delete mode 100644 .idea/libraries/Maven__org_jvnet_staxex_stax_ex_1_8.xml delete mode 100644 .idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26_cloudera_4.xml delete mode 100644 .idea/libraries/Maven__org_netpreserve_commons_webarchive_commons_1_1_8.xml delete mode 100644 .idea/libraries/Maven__org_restlet_jse_org_restlet_2_4_0.xml delete mode 100644 .idea/libraries/Maven__org_restlet_jse_org_restlet_ext_crypto_2_4_0.xml delete mode 100644 .idea/libraries/Maven__org_restlet_jse_org_restlet_ext_jetty_2_4_0.xml delete mode 100644 .idea/libraries/Maven__org_restlet_jse_org_restlet_ext_xml_2_4_0.xml delete mode 100644 .idea/libraries/Maven__org_scala_lang_scala_library_2_10_5.xml delete mode 100644 
.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_12.xml delete mode 100644 .idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_6.xml delete mode 100644 .idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_1_7.xml diff --git a/.idea/libraries/Maven__com_101tec_zkclient_0_7.xml b/.idea/libraries/Maven__com_101tec_zkclient_0_7.xml deleted file mode 100644 index a23acb560..000000000 --- a/.idea/libraries/Maven__com_101tec_zkclient_0_7.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml b/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml deleted file mode 100644 index c91e18862..000000000 --- a/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__com_googlecode_json_simple_json_simple_1_1_1.xml b/.idea/libraries/Maven__com_googlecode_json_simple_json_simple_1_1_1.xml deleted file mode 100644 index a99f3a2eb..000000000 --- a/.idea/libraries/Maven__com_googlecode_json_simple_json_simple_1_1_1.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__com_rethinkdb_rethinkdb_driver_2_3_3.xml b/.idea/libraries/Maven__com_rethinkdb_rethinkdb_driver_2_3_3.xml deleted file mode 100644 index 6ed629942..000000000 --- a/.idea/libraries/Maven__com_rethinkdb_rethinkdb_driver_2_3_3.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__com_sleepycat_je_4_1_6.xml b/.idea/libraries/Maven__com_sleepycat_je_4_1_6.xml deleted file mode 100644 index 22f18996a..000000000 --- a/.idea/libraries/Maven__com_sleepycat_je_4_1_6.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__com_sun_istack_istack_commons_runtime_3_0_7.xml b/.idea/libraries/Maven__com_sun_istack_istack_commons_runtime_3_0_7.xml deleted file mode 100644 index a9a7ec3dc..000000000 --- a/.idea/libraries/Maven__com_sun_istack_istack_commons_runtime_3_0_7.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__com_sun_xml_fastinfoset_FastInfoset_1_2_15.xml b/.idea/libraries/Maven__com_sun_xml_fastinfoset_FastInfoset_1_2_15.xml deleted file mode 100644 index ec6faea39..000000000 --- a/.idea/libraries/Maven__com_sun_xml_fastinfoset_FastInfoset_1_2_15.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_activation_javax_activation_api_1_2_0.xml b/.idea/libraries/Maven__javax_activation_javax_activation_api_1_2_0.xml deleted file mode 100644 index 5b2b25be0..000000000 --- a/.idea/libraries/Maven__javax_activation_javax_activation_api_1_2_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_servlet_javax_servlet_api_3_1_0.xml b/.idea/libraries/Maven__javax_servlet_javax_servlet_api_3_1_0.xml deleted file mode 100644 index 40d82ec8d..000000000 --- a/.idea/libraries/Maven__javax_servlet_javax_servlet_api_3_1_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_xml_bind_jaxb_api_2_3_1.xml b/.idea/libraries/Maven__javax_xml_bind_jaxb_api_2_3_1.xml deleted file mode 100644 index 870581996..000000000 --- 
a/.idea/libraries/Maven__javax_xml_bind_jaxb_api_2_3_1.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__junit_junit_4_10.xml b/.idea/libraries/Maven__junit_junit_4_10.xml deleted file mode 100644 index f126b9cb5..000000000 --- a/.idea/libraries/Maven__junit_junit_4_10.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_avro_avro_1_7_6_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_avro_avro_1_7_6_cdh5_3_5.xml deleted file mode 100644 index 186913dc3..000000000 --- a/.idea/libraries/Maven__org_apache_avro_avro_1_7_6_cdh5_3_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_curator_curator_client_2_6_0.xml b/.idea/libraries/Maven__org_apache_curator_curator_client_2_6_0.xml deleted file mode 100644 index 6a284b7d8..000000000 --- a/.idea/libraries/Maven__org_apache_curator_curator_client_2_6_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_curator_curator_framework_2_6_0.xml b/.idea/libraries/Maven__org_apache_curator_curator_framework_2_6_0.xml deleted file mode 100644 index 7c1622082..000000000 --- a/.idea/libraries/Maven__org_apache_curator_curator_framework_2_6_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_curator_curator_recipes_2_6_0.xml b/.idea/libraries/Maven__org_apache_curator_curator_recipes_2_6_0.xml deleted file mode 100644 index 35631fe16..000000000 --- a/.idea/libraries/Maven__org_apache_curator_curator_recipes_2_6_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml b/.idea/libraries/Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml deleted file mode 100644 index 8f89c372d..000000000 --- a/.idea/libraries/Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_directory_api_api_util_1_0_0_M20.xml b/.idea/libraries/Maven__org_apache_directory_api_api_util_1_0_0_M20.xml deleted file mode 100644 index 52fa36de3..000000000 --- a/.idea/libraries/Maven__org_apache_directory_api_api_util_1_0_0_M20.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml b/.idea/libraries/Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml deleted file mode 100644 index 29cd91bcd..000000000 --- a/.idea/libraries/Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml b/.idea/libraries/Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml deleted file mode 100644 index 3734fcb38..000000000 --- a/.idea/libraries/Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hadoop_hadoop_annotations_2_5_0_cdh5_3_5.xml 
b/.idea/libraries/Maven__org_apache_hadoop_hadoop_annotations_2_5_0_cdh5_3_5.xml deleted file mode 100644 index fd2eaea9b..000000000 --- a/.idea/libraries/Maven__org_apache_hadoop_hadoop_annotations_2_5_0_cdh5_3_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml deleted file mode 100644 index b4fcfd764..000000000 --- a/.idea/libraries/Maven__org_apache_hadoop_hadoop_auth_2_5_0_cdh5_3_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml deleted file mode 100644 index 2c822b77b..000000000 --- a/.idea/libraries/Maven__org_apache_hadoop_hadoop_common_2_5_0_cdh5_3_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml deleted file mode 100644 index fa1c09fe6..000000000 --- a/.idea/libraries/Maven__org_apache_hadoop_hadoop_core_2_5_0_mr1_cdh5_3_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hbase_hbase_client_0_98_6_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hbase_hbase_client_0_98_6_cdh5_3_5.xml deleted file mode 100644 index c1338c97e..000000000 --- a/.idea/libraries/Maven__org_apache_hbase_hbase_client_0_98_6_cdh5_3_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hbase_hbase_common_0_98_6_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hbase_hbase_common_0_98_6_cdh5_3_5.xml deleted file mode 100644 index d5474b74f..000000000 --- a/.idea/libraries/Maven__org_apache_hbase_hbase_common_0_98_6_cdh5_3_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml deleted file mode 100644 index a353830d3..000000000 --- a/.idea/libraries/Maven__org_apache_hbase_hbase_protocol_0_98_6_cdh5_3_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_9_0_0.xml b/.idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_9_0_0.xml deleted file mode 100644 index 9948ff7ca..000000000 --- a/.idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_9_0_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_kafka_kafka_clients_0_9_0_0.xml b/.idea/libraries/Maven__org_apache_kafka_kafka_clients_0_9_0_0.xml deleted file mode 100644 index 4396e1b8a..000000000 --- a/.idea/libraries/Maven__org_apache_kafka_kafka_clients_0_9_0_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml b/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml deleted file mode 100644 index 33f317a8c..000000000 --- 
a/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5_cdh5_3_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_cloudera_htrace_htrace_core_2_04.xml b/.idea/libraries/Maven__org_cloudera_htrace_htrace_core_2_04.xml deleted file mode 100644 index 07046f2ef..000000000 --- a/.idea/libraries/Maven__org_cloudera_htrace_htrace_core_2_04.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_http_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_http_9_4_19_v20190610.xml deleted file mode 100644 index de7725f6d..000000000 --- a/.idea/libraries/Maven__org_eclipse_jetty_jetty_http_9_4_19_v20190610.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_io_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_io_9_4_19_v20190610.xml deleted file mode 100644 index 2d6f875f0..000000000 --- a/.idea/libraries/Maven__org_eclipse_jetty_jetty_io_9_4_19_v20190610.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_security_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_security_9_4_19_v20190610.xml deleted file mode 100644 index 4e93c3865..000000000 --- a/.idea/libraries/Maven__org_eclipse_jetty_jetty_security_9_4_19_v20190610.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_server_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_server_9_4_19_v20190610.xml deleted file mode 100644 index 284f6e11d..000000000 --- a/.idea/libraries/Maven__org_eclipse_jetty_jetty_server_9_4_19_v20190610.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_servlet_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_servlet_9_4_19_v20190610.xml deleted file mode 100644 index 0453dd782..000000000 --- a/.idea/libraries/Maven__org_eclipse_jetty_jetty_servlet_9_4_19_v20190610.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jetty_jetty_util_9_4_19_v20190610.xml b/.idea/libraries/Maven__org_eclipse_jetty_jetty_util_9_4_19_v20190610.xml deleted file mode 100644 index 9ef563f3f..000000000 --- a/.idea/libraries/Maven__org_eclipse_jetty_jetty_util_9_4_19_v20190610.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_glassfish_jaxb_jaxb_runtime_2_3_1.xml b/.idea/libraries/Maven__org_glassfish_jaxb_jaxb_runtime_2_3_1.xml deleted file mode 100644 index 130f99fe0..000000000 --- a/.idea/libraries/Maven__org_glassfish_jaxb_jaxb_runtime_2_3_1.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_glassfish_jaxb_txw2_2_3_1.xml b/.idea/libraries/Maven__org_glassfish_jaxb_txw2_2_3_1.xml deleted file mode 100644 index 30bca6953..000000000 --- a/.idea/libraries/Maven__org_glassfish_jaxb_txw2_2_3_1.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_1.xml 
b/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_1.xml deleted file mode 100644 index e4c18dc38..000000000 --- a/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_1.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jvnet_staxex_stax_ex_1_8.xml b/.idea/libraries/Maven__org_jvnet_staxex_stax_ex_1_8.xml deleted file mode 100644 index 9155b90ca..000000000 --- a/.idea/libraries/Maven__org_jvnet_staxex_stax_ex_1_8.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26_cloudera_4.xml b/.idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26_cloudera_4.xml deleted file mode 100644 index d983ad72a..000000000 --- a/.idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26_cloudera_4.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_netpreserve_commons_webarchive_commons_1_1_8.xml b/.idea/libraries/Maven__org_netpreserve_commons_webarchive_commons_1_1_8.xml deleted file mode 100644 index 8b8be36c9..000000000 --- a/.idea/libraries/Maven__org_netpreserve_commons_webarchive_commons_1_1_8.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_restlet_jse_org_restlet_2_4_0.xml b/.idea/libraries/Maven__org_restlet_jse_org_restlet_2_4_0.xml deleted file mode 100644 index dd9e47799..000000000 --- a/.idea/libraries/Maven__org_restlet_jse_org_restlet_2_4_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_crypto_2_4_0.xml b/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_crypto_2_4_0.xml deleted file mode 100644 index ff8c21ea9..000000000 --- a/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_crypto_2_4_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_jetty_2_4_0.xml b/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_jetty_2_4_0.xml deleted file mode 100644 index 8c51d4463..000000000 --- a/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_jetty_2_4_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_xml_2_4_0.xml b/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_xml_2_4_0.xml deleted file mode 100644 index fb34497cf..000000000 --- a/.idea/libraries/Maven__org_restlet_jse_org_restlet_ext_xml_2_4_0.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_scala_lang_scala_library_2_10_5.xml b/.idea/libraries/Maven__org_scala_lang_scala_library_2_10_5.xml deleted file mode 100644 index 018d6c488..000000000 --- a/.idea/libraries/Maven__org_scala_lang_scala_library_2_10_5.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_12.xml b/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_12.xml deleted file mode 100644 index 4d0f846fb..000000000 --- a/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_12.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_6.xml 
b/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_6.xml deleted file mode 100644 index 906e5c9f4..000000000 --- a/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_6.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_1_7.xml b/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_1_7.xml deleted file mode 100644 index 0ed57668d..000000000 --- a/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_1_7.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file From 4983e6592931dc18f822fa3ce18126a5a789b71a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 15 Nov 2019 15:42:59 -0800 Subject: [PATCH 044/123] fix non-playlist case (oops!) --- .../modules/extractor/ExtractorYoutubeDL.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index 723d914c3..5acdd3890 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -27,6 +27,7 @@ import java.io.Reader; import java.io.StringWriter; import java.net.URI; +import java.util.Arrays; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -50,7 +51,6 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.Lifecycle; -import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.JsonParseException; @@ -168,9 +168,17 @@ protected void extract(CrawlURI uri) { } } else { JsonObject ydlJson = runYoutubeDL(uri); - if (ydlJson != null && ydlJson.has("entries")) { - JsonArray jsonEntries = ydlJson.getAsJsonArray("entries"); + if (ydlJson != null && (ydlJson.has("entries") || ydlJson.has("url"))) { + Iterable jsonEntries; + if (ydlJson.has("entries")) { + jsonEntries = ydlJson.getAsJsonArray("entries"); + } else { + jsonEntries = Arrays.asList(ydlJson); + } + + int count = 0; for (JsonElement e: jsonEntries) { + count += 1; JsonObject json = (JsonObject) e; if (json.get("url") != null) { String videoUrl = json.get("url").getAsString(); @@ -181,7 +189,7 @@ protected void extract(CrawlURI uri) { // XXX this can be large, consider using a RecordingOutputStream uri.getData().put("ydlJson", ydlJson); - String annotation = "youtube-dl:" + jsonEntries.size(); + String annotation = "youtube-dl:" + count; uri.getAnnotations().add(annotation); logContainingPage(uri, annotation); } @@ -379,7 +387,7 @@ protected JsonObject runYoutubeDL(CrawlURI uri) { JsonObject ydlJson = null; try { if (parser.hasNext()) { - ydlJson = (JsonObject) parser.next(); + ydlJson = (JsonObject) parser.next(); } } catch (JsonParseException e) { // sometimes we get no output at all from youtube-dl, which From c4b08166f27b12daea179129918b358ff596ba57 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 15 Nov 2019 15:52:36 -0800 Subject: [PATCH 045/123] extract watch page links from youtube playlists and equivalent for other sites. Usually we find these links through normal link extraction, but we have the info here, so we may as well use it to make sure. 
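For reference, the "info we have here" is youtube-dl's per-entry playlist JSON, which carries both the media URL and the watch-page URL that the loop below feeds back in as outlinks. A standalone sketch (not Heritrix code; the field names are the ones the extractor reads, the URLs are invented) of pulling both kinds of link out of that JSON with Gson:

    import com.google.gson.JsonElement;
    import com.google.gson.JsonObject;
    import com.google.gson.JsonParser;

    // Minimal illustration of the playlist JSON shape assumed here: each entry
    // has a media "url" plus the "webpage_url" of its watch page.
    public class YdlPlaylistSketch {
        public static void main(String[] args) {
            String json = "{\"entries\":[{"
                    + "\"url\":\"https://video.example/media/abc.mp4\","
                    + "\"webpage_url\":\"https://video.example/watch?v=abc\"}]}";
            JsonObject ydl = new JsonParser().parse(json).getAsJsonObject();
            for (JsonElement e : ydl.getAsJsonArray("entries")) {
                JsonObject entry = e.getAsJsonObject();
                // media url -> fetched as an embed; watch page -> queued as a nav link
                System.out.println("media:      " + entry.get("url").getAsString());
                System.out.println("watch page: " + entry.get("webpage_url").getAsString());
            }
        }
    }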
--- .../modules/extractor/ExtractorYoutubeDL.java | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index 5acdd3890..341a8ce6a 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -177,12 +177,28 @@ protected void extract(CrawlURI uri) { } int count = 0; - for (JsonElement e: jsonEntries) { + for (JsonElement jsonE: jsonEntries) { count += 1; - JsonObject json = (JsonObject) e; - if (json.get("url") != null) { - String videoUrl = json.get("url").getAsString(); - addVideoOutlink(uri, json, videoUrl); + JsonObject jsonO = (JsonObject) jsonE; + + // media url + if (jsonO.get("url") != null) { + String videoUrl = jsonO.get("url").getAsString(); + addVideoOutlink(uri, jsonO, videoUrl); + } + + // make sure we extract watch page links from youtube playlists, + // and equivalent for other sites + if (jsonO.get("webpage_url") != null) { + String webpageUrl = jsonO.get("webpage_url").getAsString(); + try { + UURI dest = UURIFactory.getInstance(uri.getUURI(), webpageUrl); + CrawlURI link = uri.createCrawlURI(dest, LinkContext.NAVLINK_MISC, + Hop.NAVLINK); + uri.getOutLinks().add(link); + } catch (URIException e1) { + logUriError(e1, uri.getUURI(), webpageUrl); + } } } From 6e7bc54c9d531a18618a5b2d6f912c09497674fd Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 15 Nov 2019 16:14:17 -0800 Subject: [PATCH 046/123] use org.json like everybody else --- contrib/pom.xml | 5 -- .../modules/extractor/ExtractorYoutubeDL.java | 61 +++++++------------ 2 files changed, 23 insertions(+), 43 deletions(-) diff --git a/contrib/pom.xml b/contrib/pom.xml index 5a1e1ba62..fa1877c67 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -66,11 +66,6 @@ rethinkdb-driver 2.3.3 - - com.googlecode.json-simple - json-simple - 1.1.1 - diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index 341a8ce6a..0fc0d070d 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -25,7 +25,6 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; -import java.io.StringWriter; import java.net.URI; import java.util.Arrays; import java.util.concurrent.Callable; @@ -48,16 +47,12 @@ import org.archive.net.UURIFactory; import org.archive.util.ArchiveUtils; import org.archive.util.MimetypeUtils; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.Lifecycle; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonParseException; -import com.google.gson.JsonStreamParser; -import com.google.gson.internal.Streams; -import com.google.gson.stream.JsonWriter; - /** * Extracts links to media by running youtube-dl in a subprocess. Runs only on * html. 
@@ -167,30 +162,28 @@ protected void extract(CrawlURI uri) { logCapturedVideo(uri, ydlAnnotation); } } else { - JsonObject ydlJson = runYoutubeDL(uri); + JSONObject ydlJson = runYoutubeDL(uri); if (ydlJson != null && (ydlJson.has("entries") || ydlJson.has("url"))) { - Iterable jsonEntries; + JSONArray jsonEntries; if (ydlJson.has("entries")) { - jsonEntries = ydlJson.getAsJsonArray("entries"); + jsonEntries = ydlJson.getJSONArray("entries"); } else { - jsonEntries = Arrays.asList(ydlJson); + jsonEntries = new JSONArray(Arrays.asList(ydlJson)); } - int count = 0; - for (JsonElement jsonE: jsonEntries) { - count += 1; - JsonObject jsonO = (JsonObject) jsonE; + for (int i = 0; i < jsonEntries.length(); i++) { + JSONObject jsonO = (JSONObject) jsonEntries.get(i); // media url if (jsonO.get("url") != null) { - String videoUrl = jsonO.get("url").getAsString(); + String videoUrl = jsonO.getString("url"); addVideoOutlink(uri, jsonO, videoUrl); } // make sure we extract watch page links from youtube playlists, // and equivalent for other sites if (jsonO.get("webpage_url") != null) { - String webpageUrl = jsonO.get("webpage_url").getAsString(); + String webpageUrl = jsonO.getString("webpage_url"); try { UURI dest = UURIFactory.getInstance(uri.getUURI(), webpageUrl); CrawlURI link = uri.createCrawlURI(dest, LinkContext.NAVLINK_MISC, @@ -205,14 +198,14 @@ protected void extract(CrawlURI uri) { // XXX this can be large, consider using a RecordingOutputStream uri.getData().put("ydlJson", ydlJson); - String annotation = "youtube-dl:" + count; + String annotation = "youtube-dl:" + jsonEntries.length(); uri.getAnnotations().add(annotation); logContainingPage(uri, annotation); } } } - protected void addVideoOutlink(CrawlURI uri, JsonObject json, + protected void addVideoOutlink(CrawlURI uri, JSONObject jsonO, String videoUrl) { try { UURI dest = UURIFactory.getInstance(uri.getUURI(), videoUrl); @@ -221,9 +214,9 @@ protected void addVideoOutlink(CrawlURI uri, JsonObject json, // annotation String annotation = "youtube-dl:1/1"; - if (!json.get("playlist_index").isJsonNull()) { - annotation = "youtube-dl:" + json.get("playlist_index") + "/" - + json.get("n_entries"); + if (jsonO.opt("playlist_index") != null) { + annotation = "youtube-dl:" + jsonO.get("playlist_index") + "/" + + jsonO.get("n_entries"); } link.getAnnotations().add(annotation); @@ -350,7 +343,7 @@ public String call() throws IOException { return output; } - protected JsonObject runYoutubeDL(CrawlURI uri) { + protected JSONObject runYoutubeDL(CrawlURI uri) { /* * --format=best * @@ -399,13 +392,10 @@ protected JsonObject runYoutubeDL(CrawlURI uri) { proc.destroyForcibly(); } - JsonStreamParser parser = new JsonStreamParser(output.stdout); - JsonObject ydlJson = null; try { - if (parser.hasNext()) { - ydlJson = (JsonObject) parser.next(); - } - } catch (JsonParseException e) { + JSONObject ydlJson = new JSONObject(output.stdout); + return ydlJson; + } catch (JSONException e) { // sometimes we get no output at all from youtube-dl, which // manifests as a JsonIOException logger.log(Level.FINE, @@ -415,8 +405,6 @@ protected JsonObject runYoutubeDL(CrawlURI uri) { e); return null; } - - return ydlJson; } @Override @@ -480,13 +468,10 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) recordInfo.setMimetype("application/vnd.youtube-dl_formats+json;charset=utf-8"); recordInfo.setEnforceLength(true); - JsonObject ydlJson = (JsonObject) curi.getData().get("ydlJson"); - StringWriter stringWriter = new StringWriter(); - JsonWriter 
jsonWriter = new JsonWriter(stringWriter); - jsonWriter.setIndent(" "); - Streams.write(ydlJson, jsonWriter); + JSONObject ydlJson = (JSONObject) curi.getData().get("ydlJson"); + String ydlJsonString = ydlJson.toString(1); - byte[] b = stringWriter.toString().getBytes("UTF-8"); + byte[] b = ydlJsonString.getBytes("UTF-8"); recordInfo.setContentStream(new ByteArrayInputStream(b)); recordInfo.setContentLength((long) b.length); From ff8ebc4c2e4f8b263ad9aad6a21f2a3e5d3c4d66 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 18 Nov 2019 14:42:38 -0800 Subject: [PATCH 047/123] use JSONObject.isNull() because opt() returns org.json.JSONObject.Null --- .../org/archive/modules/extractor/ExtractorYoutubeDL.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index 0fc0d070d..41f72d31b 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -175,7 +175,7 @@ protected void extract(CrawlURI uri) { JSONObject jsonO = (JSONObject) jsonEntries.get(i); // media url - if (jsonO.get("url") != null) { + if (!jsonO.isNull("url")) { String videoUrl = jsonO.getString("url"); addVideoOutlink(uri, jsonO, videoUrl); } @@ -214,7 +214,7 @@ protected void addVideoOutlink(CrawlURI uri, JSONObject jsonO, // annotation String annotation = "youtube-dl:1/1"; - if (jsonO.opt("playlist_index") != null) { + if (!jsonO.isNull("playlist_index")) { annotation = "youtube-dl:" + jsonO.get("playlist_index") + "/" + jsonO.get("n_entries"); } From 204491c35758c6b595b409a29e8d2fd200aa53de Mon Sep 17 00:00:00 2001 From: Colin Rosenthal Date: Fri, 12 Oct 2018 14:24:05 +0200 Subject: [PATCH 048/123] Attempt to filter out embedded images. (cherry picked from commit aa5ff1dbdaefd04652a9c66506d20f1a6ae01dc3) --- .../modules/extractor/ExtractorHTML.java | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java index c05959407..ebcaf47a0 100644 --- a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java +++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java @@ -445,18 +445,20 @@ protected void processGeneralTag(CrawlURI curi, CharSequence element, } else if (attr.start(5) > -1) { // SRC etc. CharSequence context = elementContext(element, attr.group(5)); - - // true, if we expect another HTML page instead of an image etc. - final Hop hop; - - if(!framesAsEmbeds - && (elementStr.equalsIgnoreCase(FRAME) || elementStr - .equalsIgnoreCase(IFRAME))) { - hop = Hop.NAVLINK; - } else { - hop = Hop.EMBED; + if (!context.toString().toLowerCase().startsWith("data:")) { + + // true, if we expect another HTML page instead of an image etc. + final Hop hop; + + if (!framesAsEmbeds + && (elementStr.equalsIgnoreCase(FRAME) || elementStr + .equalsIgnoreCase(IFRAME))) { + hop = Hop.NAVLINK; + } else { + hop = Hop.EMBED; + } + processEmbed(curi, value, context, hop); } - processEmbed(curi, value, context, hop); } else if (attr.start(6) > -1) { // CODEBASE codebase = (value instanceof String)? 
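A note on the data: case guarded against above (illustration only, not the patch's exact check): a data URI embeds the resource bytes inline, so there is nothing for the frontier to fetch and no point queuing it as an embedded resource.

    // Standalone sketch of the kind of src value the new guard is meant to
    // keep out of the frontier; any ordinary URL would still be queued.
    public class DataUriSketch {
        public static void main(String[] args) {
            String src = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAA=";
            if (src.toLowerCase().startsWith("data:")) {
                System.out.println("inline payload, nothing to fetch");
            } else {
                System.out.println("queue as embedded resource: " + src);
            }
        }
    }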
From 125547b93d6363417ccb1e59f2309d4e2c949d59 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 19 Nov 2019 14:34:49 -0800 Subject: [PATCH 049/123] change trough dedup `date` type to varchar. By parsing/unparsing to/from java.util.Date, we ended up with a different date format in trough (sqlite) than warcprox, which is no good; see https://github.com/internetarchive/warcprox/pull/144 --- .../recrawl/TroughContentDigestHistory.java | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java b/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java index 721efc964..0731395d9 100644 --- a/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java +++ b/contrib/src/main/java/org/archive/modules/recrawl/TroughContentDigestHistory.java @@ -5,13 +5,9 @@ import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_RECORD_ID; import java.net.MalformedURLException; -import java.text.SimpleDateFormat; -import java.util.Date; import java.util.HashMap; import java.util.List; -import java.util.Locale; import java.util.Map; -import java.util.TimeZone; import java.util.logging.Level; import java.util.logging.Logger; @@ -22,7 +18,6 @@ import org.archive.spring.KeyedProperties; import org.archive.trough.TroughClient; import org.archive.trough.TroughClient.TroughNoReadUrlException; -import org.archive.util.ArchiveUtils; import org.springframework.context.ApplicationListener; /** @@ -84,7 +79,7 @@ protected TroughClient troughClient() throws MalformedURLException { protected static final String SCHEMA_SQL = "create table dedup (\n" + " digest_key varchar(100) primary key,\n" + " url varchar(2100) not null,\n" - + " date datetime not null,\n" + + " date varchar(100) not null,\n" + " id varchar(100));\n"; // warc record id @Override @@ -121,15 +116,6 @@ public void onApplicationEvent(CrawlStateEvent event) { } } - // dates come back from sqlite in this format: 2019-03-14 00:49:14 - protected static ThreadLocal SQLITE_DATE_FORMAT = new ThreadLocal() { - protected SimpleDateFormat initialValue() { - SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); - df.setTimeZone(TimeZone.getTimeZone("GMT")); - return df; - } - }; - @Override public void load(CrawlURI curi) { // make this call in all cases so that the value is initialized and @@ -142,13 +128,12 @@ public void load(CrawlURI curi) { if (!results.isEmpty()) { Map hist = new HashMap(); hist.put(A_ORIGINAL_URL, results.get(0).get("url")); - Date date = SQLITE_DATE_FORMAT.get().parse((String) results.get(0).get("date")); - hist.put(A_ORIGINAL_DATE, ArchiveUtils.getLog14Date(date)); + hist.put(A_ORIGINAL_DATE, results.get(0).get("date")); hist.put(A_WARC_RECORD_ID, results.get(0).get("id")); if (logger.isLoggable(Level.FINER)) { logger.finer("loaded history by digest " + persistKeyFor(curi) - + " for uri " + curi + " - " + hist); + + " for uri " + curi + " - " + hist); } contentDigestHistory.putAll(hist); } @@ -173,7 +158,7 @@ public void store(CrawlURI curi) { try { String digestKey = persistKeyFor(curi); Object url = hist.get(A_ORIGINAL_URL); - Date date = ArchiveUtils.parse14DigitISODate((String) hist.get(A_ORIGINAL_DATE), null); + Object date = hist.get(A_ORIGINAL_DATE); Object recordId = hist.get(A_WARC_RECORD_ID); Object[] values = new Object[] { digestKey, url, date, recordId }; troughClient().write(getSegmentId(), WRITE_SQL_TMPL, values, SCHEMA_ID); From 
31b99605030f86cdc521b114a49fe089517af749 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 30 Dec 2019 16:02:10 -0800 Subject: [PATCH 050/123] basic level of documentation --- .../modules/warc/WARCRecordBuilder.java | 30 +++++++++++++++++++ .../writer/WARCWriterChainProcessor.java | 27 +++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java index afd19bbe9..2b5d2ebad 100644 --- a/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/WARCRecordBuilder.java @@ -6,10 +6,40 @@ import org.archive.io.warc.WARCRecordInfo; import org.archive.modules.CrawlURI; +/** + * Implementations of this interface are each responsible for building a + * particular type of WARC record. + * + * @author nlevitt + */ public interface WARCRecordBuilder { + /** + * Decides whether to build a record for the given capture. + * + *

+ * For example, {@link DnsResponseRecordBuilder#shouldBuildRecord(CrawlURI)} + * will return true if and only if curi is a capture of a dns: url. + * + * @param curi a captured url + * @return true if it is appropriate for this + * {@link WARCRecordBuilder} to build a record for this capture, + * false otherwise + */ boolean shouldBuildRecord(CrawlURI curi); + /** + * Builds a warc record for this capture. + * + * @param curi a captured url + * @param concurrentTo implementations should do this: + *

+     *    <pre>
+     *    if (concurrentTo != null) {
+     *        recordInfo.addExtraHeader(HEADER_KEY_CONCURRENT_TO,
+     *                "<" + concurrentTo + ">");
+     *    }</pre>
+ * @return the freshly built warc record + * @throws IOException + */ WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) throws IOException; diff --git a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java index 895f72608..4fc455584 100644 --- a/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java +++ b/modules/src/main/java/org/archive/modules/writer/WARCWriterChainProcessor.java @@ -23,6 +23,33 @@ import org.archive.modules.warc.WhoisResponseRecordBuilder; import org.archive.spring.HasKeyedProperties; +/** + * WARC writer processor. The types of records that to be written can be + * configured by including or excluding {@link WARCRecordBuilder} + * implementations (see {@link #setChain(List)}). + * + *

+ * <p>This is the default chain:
+ * <pre>
+ *   <property name="chain">
+ *    <list>
+ *     <bean class="org.archive.modules.warc.DnsResponseRecordBuilder"/>
+ *     <bean class="org.archive.modules.warc.HttpResponseRecordBuilder"/>
+ *     <bean class="org.archive.modules.warc.WhoisResponseRecordBuilder"/>
+ *     <bean class="org.archive.modules.warc.FtpControlConversationRecordBuilder"/>
+ *     <bean class="org.archive.modules.warc.FtpResponseRecordBuilder"/>
+ *     <bean class="org.archive.modules.warc.RevisitRecordBuilder"/>
+ *     <bean class="org.archive.modules.warc.HttpRequestRecordBuilder"/>
+ *     <bean class="org.archive.modules.warc.MetadataRecordBuilder"/>
+ *    </list>
+ *   </property>
+ * 
+ * </pre>
+ *
+ * <p>
--- .../modules/recrawl/wbm/WbmPersistLoadProcessorTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java b/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java index 88d1c0cf9..a36944285 100644 --- a/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java +++ b/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java @@ -156,8 +156,9 @@ public void _testInnerProcessResultSingleShotWithMock() throws Exception { FetchHistoryProcessor fhp = new FetchHistoryProcessor(); fhp.process(curi); } - - public void testInnerProcessResultSingleShotWithRealServer() throws Exception { + + // DISABLED: this relies on wwwb-dedup.us.archive.org which is intermittently returning 503 + public void xtestInnerProcessResultSingleShotWithRealServer() throws Exception { WbmPersistLoadProcessor t = new WbmPersistLoadProcessor(); //CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://archive.org/")); CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.mext.go.jp/null.gif")); From 615578b22df753bc3ec2349520cccae1b80a7b8d Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Wed, 15 Jan 2020 18:19:25 +0900 Subject: [PATCH 053/123] Add IntelliJ IDEA and Heritrix runtime files to .gitignore I've seen a few people accidentally commit them. --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index e40eb2ded..a06895880 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,8 @@ contrib/target */.classpath */.project */.settings +.idea +*.iml +/adhoc.keystore +/heritrix_dmesg.log +/jobs From 4be3e620c38e543477d6406e848cc912d9098042 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 15 Jan 2020 11:08:31 -0500 Subject: [PATCH 054/123] Set JUnit version to latest Updating JUnit to begin updating the unit tests --- pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pom.xml b/pom.xml index 2cf5e5648..a3bdfbd95 100644 --- a/pom.xml +++ b/pom.xml @@ -134,6 +134,11 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html httpmime 4.3.6 + + junit + junit + 4.13 + From bb61aa0bcbf0eecc9a8dd65bf84886569bb39178 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 15 Jan 2020 12:09:24 -0500 Subject: [PATCH 055/123] Fix unchecked warning --- .../org/archive/util/ObjectIdentityBdbManualCacheTest.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java b/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java index da5175e0e..1d2a3c6e0 100644 --- a/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java +++ b/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java @@ -54,10 +54,9 @@ protected void tearDown() throws Exception { super.tearDown(); } - @SuppressWarnings("unchecked") public void testReadConsistencyUnderLoad() throws Exception { final ObjectIdentityBdbManualCache> cbdbmap = - new ObjectIdentityBdbManualCache(); + new ObjectIdentityBdbManualCache<>(); cbdbmap.initialize(env, "consistencyCache", IdentityCacheableWrapper.class, From 611fa325df12e0216d5cb349a307a7ff12073977 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 15 Jan 2020 12:56:10 -0500 Subject: [PATCH 056/123] Remove unused code It reduces the clarity of the surrounding code --- .../ObjectIdentityBdbManualCacheTest.java | 31 
------------------- 1 file changed, 31 deletions(-) diff --git a/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java b/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java index 1d2a3c6e0..1a688616c 100644 --- a/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java +++ b/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java @@ -23,7 +23,6 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.math.RandomUtils; import org.archive.util.bdbje.EnhancedEnvironment; /** @@ -74,36 +73,6 @@ public void testReadConsistencyUnderLoad() throws Exception { new IdentityCacheableWrapper( key, new AtomicInteger(level.get())))); } - // backward checking that all values always at level or higher - new Thread() { - public void run() { - untilmax: while(true) { - for(int j=keyCount-1; j >= 0; j--) { - int targetValue = level.get(); - if(targetValue>=maxLevel) { - break untilmax; - } - assertTrue("stale value revseq key "+j,cbdbmap.get(""+j).get().get()>=targetValue); - Thread.yield(); - } - } - } - };//.start(); - // random checking that all values always at level or higher - new Thread() { - public void run() { - untilmax: while(true) { - int j = RandomUtils.nextInt(keyCount); - int targetValue = level.get(); - if(targetValue>=maxLevel) { - break untilmax; - } - assertTrue("stale value random key "+j, - cbdbmap.get(""+j).get().get()>=targetValue); - Thread.yield(); - } - } - };//.start(); // increment all keys for(; level.get() < maxLevel; level.incrementAndGet()) { for(int k = 0; k < keyCount; k++) { From 0fe8463e81d13c658b68da29bddc18efd1f589cd Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 15 Jan 2020 13:04:09 -0500 Subject: [PATCH 057/123] Remove AtomicInteger loop counter This test isn't multithreaded so I see no need for this --- .../util/ObjectIdentityBdbManualCacheTest.java | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java b/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java index 1a688616c..2873be5c5 100644 --- a/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java +++ b/commons/src/test/java/org/archive/util/ObjectIdentityBdbManualCacheTest.java @@ -61,7 +61,6 @@ public void testReadConsistencyUnderLoad() throws Exception { IdentityCacheableWrapper.class, env.getClassCatalog()); try { - final AtomicInteger level = new AtomicInteger(0); final int keyCount = 128 * 1024; // 128K keys final int maxLevel = 64; // initial fill @@ -71,19 +70,19 @@ public void testReadConsistencyUnderLoad() throws Exception { key, new Supplier>( new IdentityCacheableWrapper( - key, new AtomicInteger(level.get())))); + key, new AtomicInteger(0)))); } // increment all keys - for(; level.get() < maxLevel; level.incrementAndGet()) { + for(int level = 0; level < maxLevel; level++) { for(int k = 0; k < keyCount; k++) { IdentityCacheableWrapper wrap = cbdbmap.get(""+k); int foundValue = wrap.get().getAndIncrement(); wrap.makeDirty(); - assertEquals("stale value preinc key "+k, level.get(), foundValue); + assertEquals("stale value preinc key "+k, level, foundValue); } - if(level.get() % 10 == 0) { - System.out.println("level to "+level.get()); - if(level.get()>0) { + if(level % 10 == 0) { + System.out.println("level to "+level); + if(level>0) { TestUtils.forceScarceMemory(); } 
System.out.println("OIBMCT:"+cbdbmap.composeCacheSummary()); From 55308ba90ac0a7dc7f75bc346d0bd6189f2d7047 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 15 Jan 2020 13:29:32 -0500 Subject: [PATCH 058/123] Move IdentityCacheableWrapper This is utilized only in tests so it belongs there --- .../java/org/archive/util/IdentityCacheableWrapper.java | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename commons/src/{main => test}/java/org/archive/util/IdentityCacheableWrapper.java (100%) diff --git a/commons/src/main/java/org/archive/util/IdentityCacheableWrapper.java b/commons/src/test/java/org/archive/util/IdentityCacheableWrapper.java similarity index 100% rename from commons/src/main/java/org/archive/util/IdentityCacheableWrapper.java rename to commons/src/test/java/org/archive/util/IdentityCacheableWrapper.java From 0fa87a36e8f1ea4440f40ea722f32e3891f0d012 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 15 Jan 2020 13:35:45 -0500 Subject: [PATCH 059/123] Add default constructor to IdentityCacheWrapper Kryo is spending a lot of time during serialization handling a NoSuchMethodException. By adding this default constructor we can skip doing all that work. --- .../test/java/org/archive/util/IdentityCacheableWrapper.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/commons/src/test/java/org/archive/util/IdentityCacheableWrapper.java b/commons/src/test/java/org/archive/util/IdentityCacheableWrapper.java index 60b0e4e7c..a93eb6dfa 100644 --- a/commons/src/test/java/org/archive/util/IdentityCacheableWrapper.java +++ b/commons/src/test/java/org/archive/util/IdentityCacheableWrapper.java @@ -31,6 +31,9 @@ public class IdentityCacheableWrapper implements IdentityCacheable { protected K wrapped; + // For Kryo serialization + IdentityCacheableWrapper() { } + public IdentityCacheableWrapper(String key, K wrapped) { super(); this.wrapped = wrapped; From 080380b18a9f8b8c8772a2d665eb09e0632eab2f Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 15 Jan 2020 18:13:20 -0500 Subject: [PATCH 060/123] Use Guice instead of custom implementation This uses the avaialable code in Guice rather than a custom implementation. It also provides a performance increase (as demonstrated by the unit tests) --- .../org/archive/util/BloomFilter64bit.java | 259 +++--------------- 1 file changed, 45 insertions(+), 214 deletions(-) diff --git a/commons/src/main/java/org/archive/util/BloomFilter64bit.java b/commons/src/main/java/org/archive/util/BloomFilter64bit.java index eaf578745..b2ba81b6e 100644 --- a/commons/src/main/java/org/archive/util/BloomFilter64bit.java +++ b/commons/src/main/java/org/archive/util/BloomFilter64bit.java @@ -27,91 +27,30 @@ package org.archive.util; import java.io.Serializable; +import java.lang.reflect.Field; +import java.lang.reflect.Method; import java.security.SecureRandom; import java.util.Random; -/** A Bloom filter. - * - * ADAPTED/IMPROVED VERSION OF MG4J it.unimi.dsi.mg4j.util.BloomFilter - * - *
- * KEY CHANGES:
- *
- *   • NUMBER_OF_WEIGHTS is 2083, to better avoid collisions between
- *     similar strings (common in the domain of URIs)
- *   • Removed dependence on cern.colt MersenneTwister (replaced with
- *     SecureRandom) and QuickBitVector (replaced with local methods).
- *   • Adapted to allow long bit indices
- *   • Stores bitfield in an array of up to 2^22 arrays of 2^26 longs. Thus,
- *     bitfield may grow to 2^48 longs in size -- 2PiB, 2^54 bitfield indexes.
- *     (I expect this will outstrip available RAM for the next few years.)
- *
Instances of this class represent a set of character sequences (with - * false positives) using a Bloom filter. Because of the way Bloom filters work, - * you cannot remove elements. - * - *
Bloom filters have an expected error rate, depending on the number - * of hash functions used, on the filter size and on the number of elements in - * the filter. This implementation uses a variable optimal number of hash - * functions, depending on the expected number of elements. More precisely, a - * Bloom filter for n character sequences with d hash - * functions will use dn/ln 2 ≈ - * 1.44 dn bits; false positives will happen with - * probability 2^-d. - * - *
Hash functions are generated at creation time using universal hashing. - * Each hash function uses {@link #NUMBER_OF_WEIGHTS} random integers, which - * are cyclically multiplied by the character codes in a character sequence. - * The resulting integers are XOR-ed together. - * - *
This class exports access methods that are very similar to those of - * {@link java.util.Set}, but it does not implement that interface, as too - * many non-optional methods would be unimplementable (e.g., iterators). - * - * @author Sebastiano Vigna - * @author Gordon Mohr - */ +import com.google.common.annotations.VisibleForTesting; +import com.google.common.hash.Funnels; +import com.google.common.primitives.Ints; + public class BloomFilter64bit implements Serializable, BloomFilter { - private static final long serialVersionUID = 2L; + private static final long serialVersionUID = 3L; - /** The number of weights used to create hash functions. */ - protected final static int NUMBER_OF_WEIGHTS = 2083; // CHANGED FROM 16 - /** The number of bits in this filter. */ - final protected long m; - /** if bitfield is an exact power of 2 in length, it is this power */ - protected int power = -1; /** The expected number of inserts; determines calculated size */ - final protected long expectedInserts; - /** The number of hash functions used by this filter. */ - final protected int d; - /** The underlying bit vector */ - final protected long[][] bits; - /** The random integers used to generate the hash functions. */ - final protected long[][] weight; + private final long expectedInserts; /** The number of elements currently in the filter. It may be * smaller than the actual number of additions of distinct character * sequences because of false positives. */ - protected int size; - - /** The natural logarithm of 2, used in the computation of the number of bits. */ - protected final static double NATURAL_LOG_OF_2 = Math.log( 2 ); - - /** power-of-two to use as maximum size of bitfield subarrays */ - protected final static int SUBARRAY_POWER_OF_TWO = 26; // 512MiB of longs - /** number of longs in one subarray */ - protected final static int SUBARRAY_LENGTH_IN_LONGS = 1 << SUBARRAY_POWER_OF_TWO; - /** mask for lowest SUBARRAY_POWER_OF_TWO bits */ - protected final static int SUBARRAY_MASK = SUBARRAY_LENGTH_IN_LONGS - 1; //0x0FFFFFFF + private int size; - protected final static boolean DEBUG = false; + private final com.google.common.hash.BloomFilter delegate; + private final long bitSize; + private final int numHashFunctions; /** Creates a new Bloom filter with given number of hash functions and * expected number of elements. @@ -141,45 +80,18 @@ public BloomFilter64bit( final long n, final int d, boolean roundUp) { * @param roundUp if true, round bit size up to next-nearest-power-of-2 */ public BloomFilter64bit(final long n, final int d, Random weightsGenerator, boolean roundUp ) { + delegate = com.google.common.hash.BloomFilter.create(Funnels.unencodedCharsFunnel(), Ints.saturatedCast(n), 0.0000003); this.expectedInserts = n; - this.d = d; - long lenInLongs = (long)Math.ceil( ( (long)n * (long)d / NATURAL_LOG_OF_2 ) / 64L ); - if ( lenInLongs > (1L<<48) ) { - throw new IllegalArgumentException( - "This filter would require " + lenInLongs + " longs, " + - "greater than this classes maximum of 2^48 longs (2PiB)." ); - } - long lenInBits = lenInLongs * 64L; - - if(roundUp) { - int pow = 0; - while((1L<s. - * @param k a hash function index (smaller than {@link #d}). - * @return the position in the filter corresponding to s for the hash function k. 
- */ - protected long hash( final CharSequence s, final int l, final int k ) { - final long[] w = weight[ k ]; - long h = 0; - int i = l; - while( i-- != 0 ) h ^= s.charAt( i ) * w[ i % NUMBER_OF_WEIGHTS ]; - long retVal; - if(power>0) { - retVal = h >>> (64-power); - } else { - // ####----####---- - retVal = ( h & 0x7FFFFFFFFFFFFFFFL ) % m; - } - return retVal; - } - - public long[] bitIndexesFor(CharSequence s) { - long[] ret = new long[d]; - for(int i = 0; i < d; i++) { - ret[i] = hash(s,s.length(),i); - } - return ret; - } - /** Checks whether the given character sequence is in this filter. * *
Note that this method may return true on a character sequence that is has @@ -237,9 +119,7 @@ public long[] bitIndexesFor(CharSequence s) { */ public boolean contains( final CharSequence s ) { - int i = d, l = s.length(); - while( i-- != 0 ) if ( ! getBit( hash( s, l, i ) ) ) return false; - return true; + return delegate.mightContain(s); } /** Adds a character sequence to the filter. @@ -249,80 +129,17 @@ public boolean contains( final CharSequence s ) { */ public boolean add( final CharSequence s ) { - boolean result = false; - int i = d, l = s.length(); - long h; - while( i-- != 0 ) { - h = hash( s, l, i ); - if ( ! setGetBit( h ) ) { - result = true; - } - } - if ( result ) size++; - return result; + size++; + return delegate.put(s); } - - protected final static long ADDRESS_BITS_PER_UNIT = 6; // 64=2^6 - protected final static long BIT_INDEX_MASK = (1<<6)-1; // = 63 = 2^BITS_PER_UNIT - 1; - /** - * Returns from the local bitvector the value of the bit with - * the specified index. The value is true if the bit - * with the index bitIndex is currently set; otherwise, - * returns false. - * - * (adapted from cern.colt.bitvector.QuickBitVector) - * - * @param bitIndex the bit index. - * @return the value of the bit with the specified index. + /* (non-Javadoc) + * @see org.archive.util.BloomFilter#getSizeBytes() */ - public boolean getBit(long bitIndex) { - long longIndex = bitIndex >>> ADDRESS_BITS_PER_UNIT; - int arrayIndex = (int) (longIndex >>> SUBARRAY_POWER_OF_TWO); - int subarrayIndex = (int) (longIndex & SUBARRAY_MASK); - return ((bits[arrayIndex][subarrayIndex] & (1L << (bitIndex & BIT_INDEX_MASK))) != 0); + public long getSizeBytes() { + return bitSize / 8; } - /** - * Changes the bit with index bitIndex in local bitvector. - * - * (adapted from cern.colt.bitvector.QuickBitVector) - * - * @param bitIndex the index of the bit to be set. - */ - protected void setBit( long bitIndex) { - long longIndex = bitIndex >>> ADDRESS_BITS_PER_UNIT; - int arrayIndex = (int) (longIndex >>> SUBARRAY_POWER_OF_TWO); - int subarrayIndex = (int) (longIndex & SUBARRAY_MASK); - bits[arrayIndex][subarrayIndex] |= (1L << (bitIndex & BIT_INDEX_MASK)); - } - - /** - * Sets the bit with index bitIndex in local bitvector -- - * returning the old value. - * - * (adapted from cern.colt.bitvector.QuickBitVector) - * - * @param bitIndex the index of the bit to be set. 
- */ - protected boolean setGetBit( long bitIndex) { - long longIndex = bitIndex >>> ADDRESS_BITS_PER_UNIT; - int arrayIndex = (int) (longIndex >>> SUBARRAY_POWER_OF_TWO); - int subarrayIndex = (int) (longIndex & SUBARRAY_MASK); - long mask = 1L << (bitIndex & BIT_INDEX_MASK); - boolean ret = (bits[arrayIndex][subarrayIndex] & mask)!=0; - bits[arrayIndex][subarrayIndex] |= mask; - return ret; - } - - /* (non-Javadoc) - * @see org.archive.util.BloomFilter#getSizeBytes() - */ - public long getSizeBytes() { - // account for ragged-sized last array - return 8*(((bits.length-1)*bits[0].length)+bits[bits.length-1].length); - } - @Override public long getExpectedInserts() { return expectedInserts; @@ -330,6 +147,20 @@ public long getExpectedInserts() { @Override public long getHashCount() { - return d; + return numHashFunctions; + } + + @VisibleForTesting + public boolean getBit(long bitIndex) { + try { + Field bitsField = delegate.getClass().getDeclaredField("bits"); + bitsField.setAccessible(true); + Object bitarray = bitsField.get(delegate); + Method getBitMethod = bitarray.getClass().getDeclaredMethod("get", long.class); + getBitMethod.setAccessible(true); + return (boolean) getBitMethod.invoke(bitarray, bitIndex); + } catch (Exception e) { + throw new RuntimeException(e); + } } } From 66723aea68019ec122efeae48b16c2f77e11b753 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Tue, 21 Jan 2020 10:52:15 -0500 Subject: [PATCH 061/123] Remove version Allow the parent POM to specify the version --- commons/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/commons/pom.xml b/commons/pom.xml index 3a121b022..910d0d09e 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -119,7 +119,6 @@ junit junit - 3.8.2 compile From 784b3a5dec90c9440c9e5db41cc11adda6033b24 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 22 Jan 2020 13:39:24 -0800 Subject: [PATCH 062/123] fix logging config by setting system property java.util.logging.config.file, because new version restlet reconfigures logging after heritrix has already configured it --- engine/src/main/java/org/archive/crawler/Heritrix.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/engine/src/main/java/org/archive/crawler/Heritrix.java b/engine/src/main/java/org/archive/crawler/Heritrix.java index dd9e208ef..91b69afe9 100644 --- a/engine/src/main/java/org/archive/crawler/Heritrix.java +++ b/engine/src/main/java/org/archive/crawler/Heritrix.java @@ -314,6 +314,9 @@ public void instanceMain(String[] args) useAdhocKeystore(startupOut); } + // Restlet will reconfigure logging according to the system property + // so we must set it for -l to work properly + System.setProperty("java.util.logging.config.file", properties.getPath()); if (properties.exists()) { FileInputStream finp = new FileInputStream(properties); LogManager.getLogManager().readConfiguration(finp); From 9229a51dcb9bc3cc10b7b8d7ba8a937d60061562 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 22 Jan 2020 13:49:24 -0800 Subject: [PATCH 063/123] limit ExtractorYoutubeDL heap usage We were seeing OOME due to large youtoube-dl json (for playlists and such). So instead of storing the json in ram, stream through it, and stash the contents in an thread-local anonymous tempfile so it can be written to to warc. 
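The patch below implements this with Gson's streaming `JsonReader`: it walks the document token by token, keeps only the handful of `url`/`webpage_url` values it cares about, and a small tee copies the raw bytes into a per-thread temp file for the eventual WARC record. A stripped-down sketch of the streaming half (standalone, with simplified field handling; the input path is arbitrary):

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.ArrayList;
    import java.util.List;

    import com.google.gson.stream.JsonReader;
    import com.google.gson.stream.JsonToken;

    public class StreamingJsonUrlScan {
        /** Collects "url" member values without ever holding the whole document in memory. */
        public static List<String> scan(String path) throws IOException {
            List<String> urls = new ArrayList<String>();
            try (JsonReader r = new JsonReader(
                    new InputStreamReader(new FileInputStream(path), "UTF-8"))) {
                JsonToken t = r.peek();
                while (t != JsonToken.END_DOCUMENT) {
                    switch (t) {
                    case BEGIN_OBJECT: r.beginObject(); break;
                    case END_OBJECT:   r.endObject();   break;
                    case BEGIN_ARRAY:  r.beginArray();  break;
                    case END_ARRAY:    r.endArray();    break;
                    case NAME:         r.nextName();    break;
                    case STRING: {
                        String value = r.nextString();
                        // after reading the value, getPath() still names the member it belongs to
                        if (r.getPath().endsWith(".url")) {
                            urls.add(value);
                        }
                        break;
                    }
                    case NUMBER:  r.nextString();  break; // consume without keeping
                    case BOOLEAN: r.nextBoolean(); break;
                    case NULL:    r.nextNull();    break;
                    default: throw new IOException("unexpected token " + t);
                    }
                    t = r.peek();
                }
            }
            return urls;
        }

        public static void main(String[] args) throws IOException {
            System.out.println(scan(args[0]));
        }
    }

The real extractor does the same walk but distinguishes `$.url`/`$.entries[i].url` (video links) from `webpage_url` (containing pages), and wraps the input in a `TeedInputStream` so the JSON lands in the temp file as a side effect of parsing it.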
--- contrib/pom.xml | 5 + .../modules/extractor/ExtractorYoutubeDL.java | 374 ++++++++++++------ 2 files changed, 264 insertions(+), 115 deletions(-) diff --git a/contrib/pom.xml b/contrib/pom.xml index 750ed9fe6..2667cbcda 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -73,6 +73,11 @@ rethinkdb-driver 2.3.3 + + com.google.code.gson + gson + 2.8.6 + diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index 41f72d31b..81b9a1d59 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -21,17 +21,25 @@ import static org.archive.format.warc.WARCConstants.HEADER_KEY_CONCURRENT_TO; -import java.io.ByteArrayInputStream; +import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; +import java.io.RandomAccessFile; import java.io.Reader; +import java.io.UnsupportedEncodingException; import java.net.URI; -import java.util.Arrays; +import java.nio.channels.Channels; +import java.util.ArrayList; +import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; @@ -47,12 +55,12 @@ import org.archive.net.UURIFactory; import org.archive.util.ArchiveUtils; import org.archive.util.MimetypeUtils; -import org.json.JSONArray; -import org.json.JSONException; -import org.json.JSONObject; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.Lifecycle; +import com.google.gson.stream.JsonReader; +import com.google.gson.stream.JsonToken; + /** * Extracts links to media by running youtube-dl in a subprocess. Runs only on * html. 
@@ -106,6 +114,21 @@ public class ExtractorYoutubeDL extends Extractor protected transient Logger ydlLogger = null; + // unnamed toethread-local temporary file + protected transient ThreadLocal tempfile = new ThreadLocal() { + protected RandomAccessFile initialValue() { + File t; + try { + t = File.createTempFile("ydl", ".json"); + RandomAccessFile f = new RandomAccessFile(t, "rw"); + t.delete(); + return f; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + protected CrawlerLoggerModule crawlerLoggerModule; public CrawlerLoggerModule getCrawlerLoggerModule() { return this.crawlerLoggerModule; @@ -162,62 +185,38 @@ protected void extract(CrawlURI uri) { logCapturedVideo(uri, ydlAnnotation); } } else { - JSONObject ydlJson = runYoutubeDL(uri); - if (ydlJson != null && (ydlJson.has("entries") || ydlJson.has("url"))) { - JSONArray jsonEntries; - if (ydlJson.has("entries")) { - jsonEntries = ydlJson.getJSONArray("entries"); - } else { - jsonEntries = new JSONArray(Arrays.asList(ydlJson)); - } - - for (int i = 0; i < jsonEntries.length(); i++) { - JSONObject jsonO = (JSONObject) jsonEntries.get(i); - - // media url - if (!jsonO.isNull("url")) { - String videoUrl = jsonO.getString("url"); - addVideoOutlink(uri, jsonO, videoUrl); - } + YoutubeDLResults results = runYoutubeDL(uri); + for (int i = 0; i < results.videoUrls.size(); i++) { + addVideoOutlink(uri, results.videoUrls.get(i), i, results.videoUrls.size()); + } - // make sure we extract watch page links from youtube playlists, - // and equivalent for other sites - if (jsonO.get("webpage_url") != null) { - String webpageUrl = jsonO.getString("webpage_url"); - try { - UURI dest = UURIFactory.getInstance(uri.getUURI(), webpageUrl); - CrawlURI link = uri.createCrawlURI(dest, LinkContext.NAVLINK_MISC, - Hop.NAVLINK); - uri.getOutLinks().add(link); - } catch (URIException e1) { - logUriError(e1, uri.getUURI(), webpageUrl); - } - } + for (String pageUrl: results.pageUrls) { + try { + UURI dest = UURIFactory.getInstance(uri.getUURI(), pageUrl); + CrawlURI link = uri.createCrawlURI(dest, LinkContext.NAVLINK_MISC, + Hop.NAVLINK); + uri.getOutLinks().add(link); + } catch (URIException e1) { + logUriError(e1, uri.getUURI(), pageUrl); } + } - // XXX this can be large, consider using a RecordingOutputStream - uri.getData().put("ydlJson", ydlJson); - - String annotation = "youtube-dl:" + jsonEntries.length(); + if (results.videoUrls.size() > 0) { + String annotation = "youtube-dl:" + results.videoUrls.size(); uri.getAnnotations().add(annotation); logContainingPage(uri, annotation); } } } - protected void addVideoOutlink(CrawlURI uri, JSONObject jsonO, - String videoUrl) { + protected void addVideoOutlink(CrawlURI uri, String videoUrl, int playlistIndex, int nEntries) { try { UURI dest = UURIFactory.getInstance(uri.getUURI(), videoUrl); CrawlURI link = uri.createCrawlURI(dest, LinkContext.EMBED_MISC, Hop.EMBED); // annotation - String annotation = "youtube-dl:1/1"; - if (!jsonO.isNull("playlist_index")) { - annotation = "youtube-dl:" + jsonO.get("playlist_index") + "/" - + jsonO.get("n_entries"); - } + String annotation = "youtube-dl:" + (playlistIndex + 1) + "/" + nEntries; link.getAnnotations().add(annotation); // save info unambiguously identifying containing page capture @@ -303,47 +302,131 @@ protected void doRedirectInheritance(CrawlURI uri, String ydlAnnotation) { } } - static protected class ProcessOutput { - public String stdout; - public String stderr; + protected static class YoutubeDLResults { + RandomAccessFile 
jsonFile; + List videoUrls = new ArrayList(); + List pageUrls = new ArrayList(); + + public YoutubeDLResults(RandomAccessFile jsonFile) { + this.jsonFile = jsonFile; + try { + this.jsonFile.setLength(0); + this.jsonFile.seek(0); + } catch (IOException e) { + throw new RuntimeException(e); + } + } } - // read stdout in this thread, stderr in separate thread - // see https://github.com/internetarchive/heritrix3/pull/257/files#r279990349 - protected ProcessOutput readOutput(Process proc) throws IOException { - ProcessOutput output = new ProcessOutput(); + /** + * Copies stream to RandomAccessFile out as it is read. + */ + static class TeedInputStream extends InputStream { + private InputStream in; + private RandomAccessFile out; - Reader err = new InputStreamReader(proc.getErrorStream(), "UTF-8"); - InputStreamReader out = new InputStreamReader(proc.getInputStream(), "UTF-8"); - ExecutorService threadPool = Executors.newSingleThreadExecutor(); + public TeedInputStream(InputStream in, RandomAccessFile out) { + this.in = in; + this.out = out; + } - Future future = threadPool.submit(new Callable() { - @Override - public String call() throws IOException { - return readToEnd(err); + @Override + public int read() throws IOException { + int b = in.read(); + if (b >= 0) { + out.write(b); } - }); + return b; + } - output.stdout = readToEnd(out); + @Override + public int read(byte b[], int off, int len) throws IOException { + int n = in.read(b, off, len); + if (n > 0) { + out.write(b, off, n); + } + return n; + } - try { - output.stderr = future.get(); - } catch (InterruptedException e) { - throw new IOException(e); // :shrug: - } catch (ExecutionException e) { - if (e.getCause() instanceof IOException) { - throw (IOException) e.getCause(); - } else { - throw new IOException(e); + @Override + public byte[] readNBytes(int len) throws IOException { + byte[] buf = in.readNBytes(len); + out.write(buf); + return buf; + } + + @Override + public int readNBytes(byte[] b, int off, int len) throws IOException { + int n = in.readNBytes(b, off, len); + if (n > 0) { + out.write(b, off, n); } - } finally { - threadPool.shutdown(); + return n; } + } - return output; + /** + * Streams through youtube-dl json output. Sticks video urls in + * results.videoUrls, web page urls in + * results.pageUrls, and saves the json in anonymous temp file + * results.jsonFile. 
+ */ + protected void streamYdlOutput(InputStream in, YoutubeDLResults results) throws IOException { + TeedInputStream tee = new TeedInputStream(in, results.jsonFile); + try (JsonReader jsonReader = new JsonReader(new InputStreamReader(tee, "UTF-8"))) { + while (true) { + JsonToken nextToken = jsonReader.peek(); + switch (nextToken) { + case BEGIN_ARRAY: + jsonReader.beginArray(); + break; + case BEGIN_OBJECT: + jsonReader.beginObject(); + break; + case BOOLEAN: + jsonReader.nextBoolean(); + break; + case END_ARRAY: + jsonReader.endArray(); + break; + case END_DOCUMENT: + return; + case END_OBJECT: + jsonReader.endObject(); + break; + case NAME: + jsonReader.nextName(); + break; + case NULL: + jsonReader.nextNull(); + break; + case NUMBER: + jsonReader.nextString(); + break; + case STRING: + String value = jsonReader.nextString(); + if ("$.url".equals(jsonReader.getPath()) + || jsonReader.getPath().matches("^\\$\\.entries\\[\\d+\\]\\.url$")) { + results.videoUrls.add(value); + } else if ("$.webpage_url".equals(jsonReader.getPath()) + || jsonReader.getPath().matches("^\\$\\.entries\\[\\d+\\]\\.webpage_url$")) { + results.pageUrls.add(value); + } + break; + default: + throw new RuntimeException("unexpected json token " + nextToken); + } + } + } } - protected JSONObject runYoutubeDL(CrawlURI uri) { + /** + * Writes output to this.tempFile.get(). + * + * Reads stdout in this thread, stderr in separate thread. + * see https://github.com/internetarchive/heritrix3/pull/257/files#r279990349 + */ + protected YoutubeDLResults runYoutubeDL(CrawlURI uri) { /* * --format=best * @@ -354,7 +437,7 @@ protected JSONObject runYoutubeDL(CrawlURI uri) { ProcessBuilder pb = new ProcessBuilder("youtube-dl", "--ignore-config", "--simulate", "--dump-single-json", "--format=best", "--playlist-end=" + MAX_VIDEOS_PER_PAGE, uri.toString()); - logger.fine("running " + pb.command()); + logger.info("running: " + String.join(" ", pb.command())); Process proc = null; try { @@ -364,47 +447,54 @@ protected JSONObject runYoutubeDL(CrawlURI uri) { return null; } - ProcessOutput output; + Reader err; try { - output = readOutput(proc); - } catch (IOException e) { - logger.log(Level.WARNING, - "problem reading output from youtube-dl " + pb.command(), - e); - return null; + err = new InputStreamReader(proc.getErrorStream(), "UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); } + ExecutorService threadPool = Executors.newSingleThreadExecutor(); - try { - if (proc.waitFor() != 0) { - /* - * youtube-dl is noisy when it fails to find a video. I guess - * the assumption is that you're running it on pages you know - * have videos. We could be hiding real errors in some cases - * but it's just too much noise to log this at WARNING level. 
- */ - logger.fine("youtube-dl exited with status " - + proc.waitFor() + " " + pb.command() - + "\n=== stdout ===\n" + output.stdout - + "\n=== stderr ===\n" + output.stderr); - return null; + Future future = threadPool.submit(new Callable() { + @Override + public String call() throws IOException { + return readToEnd(err); } - } catch (InterruptedException e) { - proc.destroyForcibly(); - } + }); + + YoutubeDLResults results = new YoutubeDLResults(tempfile.get()); try { - JSONObject ydlJson = new JSONObject(output.stdout); - return ydlJson; - } catch (JSONException e) { - // sometimes we get no output at all from youtube-dl, which - // manifests as a JsonIOException - logger.log(Level.FINE, - "problem parsing json from youtube-dl " + pb.command() - + "\n=== stdout ===\n" + output.stdout - + "\n=== stderr ===\n" + output.stderr, + try { + streamYdlOutput(proc.getInputStream(), results); + } catch (EOFException e) { + try { + // this happens when there was no json output, which means no videos + // were found, totally normal + logger.log(Level.FINE, "problem parsing json from youtube-dl " + pb.command() + " " + future.get()); + } catch (InterruptedException e1) { + throw new IOException(e1); + } catch (ExecutionException e1) { + throw new IOException(e1); + } + } + } catch (IOException e) { + logger.log(Level.WARNING, + "problem reading output from youtube-dl " + pb.command(), e); return null; + } finally { + try { + // the process should already have completed + proc.waitFor(1, TimeUnit.SECONDS); + } catch (InterruptedException e) { + logger.warning("killing pid " + proc.pid()); + proc.destroyForcibly(); + } + threadPool.shutdown(); } + + return results; } @Override @@ -446,8 +536,11 @@ protected boolean shouldExtract(CrawlURI uri) { } @Override - public boolean shouldBuildRecord(CrawlURI curi) { - return curi.containsDataKey("ydlJson"); + public boolean shouldBuildRecord(CrawlURI uri) { + // should build record for containing page, which has an + // annotation like "youtube-dl:3" (no slash) + String annotation = findYdlAnnotation(uri); + return annotation != null && !annotation.contains("/"); } @Override @@ -468,13 +561,64 @@ public WARCRecordInfo buildRecord(CrawlURI curi, URI concurrentTo) recordInfo.setMimetype("application/vnd.youtube-dl_formats+json;charset=utf-8"); recordInfo.setEnforceLength(true); - JSONObject ydlJson = (JSONObject) curi.getData().get("ydlJson"); - String ydlJsonString = ydlJson.toString(1); + tempfile.get().seek(0); + InputStream inputStream = Channels.newInputStream(tempfile.get().getChannel()); + recordInfo.setContentStream(inputStream); + recordInfo.setContentLength(tempfile.get().length()); - byte[] b = ydlJsonString.getBytes("UTF-8"); - recordInfo.setContentStream(new ByteArrayInputStream(b)); - recordInfo.setContentLength((long) b.length); + logger.info("built record timestamp=" + timestamp + " url=" + recordInfo.getUrl()); return recordInfo; } + + public static void main(String[] args) throws IOException { + /* + File t = File.createTempFile("ydl", ".json"); + try (RandomAccessFile f = new RandomAccessFile(t, "rw")) { + t.delete(); + f.write("hello!\n".getBytes()); + System.out.println("length: " + f.length()); + System.out.println("tell: " + f.getFilePointer()); + f.seek(0); + System.out.println("tell: " + f.getFilePointer()); + String l = f.readLine(); + System.out.println("read line: " + l); + System.out.println("tell: " + f.getFilePointer()); + } + */ + + ExtractorYoutubeDL e = new ExtractorYoutubeDL(); + + FileInputStream in = new 
FileInputStream("/tmp/ydl-single-video.json"); + YoutubeDLResults results = new YoutubeDLResults(e.tempfile.get()); + e.streamYdlOutput(in, results); + System.out.println("video urls: " + results.videoUrls); + System.out.println("page urls: " + results.pageUrls); + + results.jsonFile.seek(0); + byte[] buf = new byte[4096]; + while (true) { + int n = results.jsonFile.read(buf); + if (n < 0) { + break; + } + System.out.write(buf, 0, n); + } + + in = new FileInputStream("/tmp/ydl-uncgreensboro-limited.json"); + results = new YoutubeDLResults(e.tempfile.get()); + e.streamYdlOutput(in, results); + System.out.println("video urls: " + results.videoUrls); + System.out.println("page urls: " + results.pageUrls); + + results.jsonFile.seek(0); + while (true) { + int n = results.jsonFile.read(buf); + if (n < 0) { + break; + } + System.out.write(buf, 0, n); + } + + } } From 6667186826fdb6259ec6a9cfd57b878ed416c38a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 22 Jan 2020 14:12:36 -0800 Subject: [PATCH 064/123] java 8 compatibility --- .../modules/extractor/ExtractorYoutubeDL.java | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index 81b9a1d59..661e06691 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -347,22 +347,6 @@ public int read(byte b[], int off, int len) throws IOException { } return n; } - - @Override - public byte[] readNBytes(int len) throws IOException { - byte[] buf = in.readNBytes(len); - out.write(buf); - return buf; - } - - @Override - public int readNBytes(byte[] b, int off, int len) throws IOException { - int n = in.readNBytes(b, off, len); - if (n > 0) { - out.write(b, off, n); - } - return n; - } } /** @@ -488,7 +472,7 @@ public String call() throws IOException { // the process should already have completed proc.waitFor(1, TimeUnit.SECONDS); } catch (InterruptedException e) { - logger.warning("killing pid " + proc.pid()); + logger.warning("youtube-dl still running? killing it"); proc.destroyForcibly(); } threadPool.shutdown(); From 33458f1518843a359b10e9731b456be40969c425 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 22 Jan 2020 17:13:58 -0500 Subject: [PATCH 065/123] Fix assertions By using assertEquals and seting the expected and actual values, the failure messages become a bit more useful. --- .../org/archive/crawler/util/BloomUriUniqFilterTest.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/engine/src/test/java/org/archive/crawler/util/BloomUriUniqFilterTest.java b/engine/src/test/java/org/archive/crawler/util/BloomUriUniqFilterTest.java index 28390704e..81e3c1a51 100644 --- a/engine/src/test/java/org/archive/crawler/util/BloomUriUniqFilterTest.java +++ b/engine/src/test/java/org/archive/crawler/util/BloomUriUniqFilterTest.java @@ -67,7 +67,7 @@ public void testAdding() throws URIException { this.filter.addForce(this.getUri(), new CrawlURI(UURIFactory.getInstance(this.getUri()))); // Should only have add 'this' once. 
- assertTrue("Count is off", this.filter.count() == 1); + assertEquals("Count is off", 1, this.filter.count()); } /** @@ -104,8 +104,7 @@ public void testWriting() throws URIException { logger.fine("Readded subset " + list.size() + " in " + (System.currentTimeMillis() - start)); - assertTrue("Count is off: " + filter.count(), - filter.count() == MAX_COUNT); + assertEquals("Count is off", MAX_COUNT, filter.count()); } public void testNote() { From 9cb9563da3add0ef18fa0811554b936e0a4dde66 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 22 Jan 2020 17:15:24 -0500 Subject: [PATCH 066/123] Increment the count only when the filter notes it Otherwise this is a count of how many times this add method is called, not how many times an element was noted as being actually added. --- .../src/main/java/org/archive/util/BloomFilter64bit.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/commons/src/main/java/org/archive/util/BloomFilter64bit.java b/commons/src/main/java/org/archive/util/BloomFilter64bit.java index b2ba81b6e..9c048a595 100644 --- a/commons/src/main/java/org/archive/util/BloomFilter64bit.java +++ b/commons/src/main/java/org/archive/util/BloomFilter64bit.java @@ -129,8 +129,11 @@ public boolean contains( final CharSequence s ) { */ public boolean add( final CharSequence s ) { - size++; - return delegate.put(s); + boolean added = delegate.put(s); + if (added) { + size++; + } + return added; } /* (non-Javadoc) From 3003a8e751c39e9bea874c09cca5300330d16b2f Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Fri, 24 Jan 2020 12:28:11 -0500 Subject: [PATCH 067/123] Replace constant with accessor methods CrawlURI already had the accessor method, and the use of the constant was a bit inconsistent. This change adds the corresponding mutator method to make working with the CrawlURI history a bit simpler. --- .../org/archive/modules/recrawl/FetchHistoryHelper.java | 5 ++--- .../modules/recrawl/wbm/WbmPersistLoadProcessorTest.java | 8 +++----- modules/src/main/java/org/archive/modules/CrawlURI.java | 7 ++++++- .../archive/modules/recrawl/FetchHistoryProcessor.java | 3 +-- .../modules/recrawl/RecrawlAttributeConstants.java | 9 +++++++-- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/contrib/src/main/java/org/archive/modules/recrawl/FetchHistoryHelper.java b/contrib/src/main/java/org/archive/modules/recrawl/FetchHistoryHelper.java index 4705a9f65..a4807a526 100644 --- a/contrib/src/main/java/org/archive/modules/recrawl/FetchHistoryHelper.java +++ b/contrib/src/main/java/org/archive/modules/recrawl/FetchHistoryHelper.java @@ -53,13 +53,12 @@ public class FetchHistoryHelper { */ @SuppressWarnings("unchecked") public static Map getFetchHistory(CrawlURI uri, long timestamp, int historyLength) { - Map data = uri.getData(); - Map[] history = (Map[])data.get(RecrawlAttributeConstants.A_FETCH_HISTORY); + Map[] history = uri.getFetchHistory(); if (history == null) { // there's no history records at all. // FetchHistoryProcessor assumes history is HashMap[], not Map[]. 
history = new HashMap[historyLength]; - data.put(RecrawlAttributeConstants.A_FETCH_HISTORY, history); + uri.setFetchHistory(history); } for (int i = 0; i < history.length; i++) { if (history[i] == null) { diff --git a/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java b/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java index 88d1c0cf9..ab16e584b 100644 --- a/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java +++ b/contrib/src/test/java/org/archive/modules/recrawl/wbm/WbmPersistLoadProcessorTest.java @@ -80,9 +80,7 @@ public TestNormalHttpResponse() { } protected Map getFetchHistory(CrawlURI curi, int idx) { - Map data = curi.getData(); - @SuppressWarnings("unchecked") - Map[] historyArray = (Map[])data.get(RecrawlAttributeConstants.A_FETCH_HISTORY); + Map[] historyArray = curi.getFetchHistory(); assertNotNull(historyArray); Map history = historyArray[idx]; return history; @@ -114,10 +112,10 @@ public void _testInnerProcessResultSingleShotWithMock() throws Exception { // put history entry newer than being loaded (i.e. loaded history entry will not be used for FetchHistoryProcessor // check below. long expected_ts = DateUtils.parse14DigitDate(TestNormalHttpResponse.EXPECTED_TS).getTime(); - Map[] fetchHistory = (Map[])curi.getData().get(RecrawlAttributeConstants.A_FETCH_HISTORY); + Map[] fetchHistory = curi.getFetchHistory(); if (fetchHistory == null) { fetchHistory = new HashMap[2]; - curi.getData().put(RecrawlAttributeConstants.A_FETCH_HISTORY, fetchHistory); + curi.setFetchHistory(fetchHistory); } final byte[] digestValue0 = sha1Digest("0"); final byte[] digestValue1 = sha1Digest("1"); diff --git a/modules/src/main/java/org/archive/modules/CrawlURI.java b/modules/src/main/java/org/archive/modules/CrawlURI.java index 92d2870f9..828636989 100644 --- a/modules/src/main/java/org/archive/modules/CrawlURI.java +++ b/modules/src/main/java/org/archive/modules/CrawlURI.java @@ -57,7 +57,6 @@ import static org.archive.modules.fetcher.FetchStatusCodes.S_UNATTEMPTED; import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST_HISTORY; -import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_FETCH_HISTORY; import java.io.IOException; import java.io.ObjectInputStream; @@ -122,6 +121,8 @@ public class CrawlURI Logger.getLogger(CrawlURI.class.getName()); public static final int UNCALCULATED = -1; + /** fetch history array */ + public static final String A_FETCH_HISTORY = "fetch-history"; public static enum FetchType { HTTP_GET, HTTP_POST, UNKNOWN }; @@ -1826,6 +1827,10 @@ public void setHttpAuthChallenges(Map httpAuthChallenges) { public HashMap[] getFetchHistory() { return (HashMap[]) getData().get(A_FETCH_HISTORY); } + + public void setFetchHistory(Map[] history) { + getData().put(A_FETCH_HISTORY, history); + } public HashMap getContentDigestHistory() { @SuppressWarnings("unchecked") diff --git a/modules/src/main/java/org/archive/modules/recrawl/FetchHistoryProcessor.java b/modules/src/main/java/org/archive/modules/recrawl/FetchHistoryProcessor.java index 889fa05c9..2f4dce68d 100644 --- a/modules/src/main/java/org/archive/modules/recrawl/FetchHistoryProcessor.java +++ b/modules/src/main/java/org/archive/modules/recrawl/FetchHistoryProcessor.java @@ -22,7 +22,6 @@ import static org.archive.modules.CoreAttributeConstants.A_FETCH_BEGAN_TIME; import static 
org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ETAG_HEADER; -import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_FETCH_HISTORY; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_LAST_MODIFIED_HEADER; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_REFERENCE_LENGTH; import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_STATUS; @@ -103,7 +102,7 @@ protected void innerProcess(CrawlURI puri) throws InterruptedException { } history[0] = latestFetch; - curi.getData().put(A_FETCH_HISTORY, history); + curi.setFetchHistory(history); if (curi.getFetchStatus() == 304) { if( history.length >= 2 && history[1] != null && history[1].containsKey(A_CONTENT_DIGEST)) { diff --git a/modules/src/main/java/org/archive/modules/recrawl/RecrawlAttributeConstants.java b/modules/src/main/java/org/archive/modules/recrawl/RecrawlAttributeConstants.java index e7c41dcef..f284318d4 100644 --- a/modules/src/main/java/org/archive/modules/recrawl/RecrawlAttributeConstants.java +++ b/modules/src/main/java/org/archive/modules/recrawl/RecrawlAttributeConstants.java @@ -19,6 +19,8 @@ package org.archive.modules.recrawl; +import org.archive.modules.CrawlURI; + /** * * @author pjack @@ -28,8 +30,11 @@ public interface RecrawlAttributeConstants { /* Duplication-reduction / recrawl / history constants */ - /** fetch history array */ - public static final String A_FETCH_HISTORY = "fetch-history"; + /** + * @deprecated Please use {@link org.archive.modules.CrawlURI#getFetchHistory()} and {@link org.archive.modules.CrawlURI#setFetchHistory(java.util.Map[])} + */ + @Deprecated + public static final String A_FETCH_HISTORY = CrawlURI.A_FETCH_HISTORY; /** content digest */ public static final String A_CONTENT_DIGEST = "content-digest"; /** header name (and AList key) for last-modified timestamp */ From b5f95c5e068993ecc5a2dd368cc1c467fe0d5748 Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Fri, 24 Jan 2020 12:46:37 -0500 Subject: [PATCH 068/123] Replace custom Base32 encoding Guava is available so a custom implementation is unnecessary. --- .../main/java/org/archive/util/Base32.java | 139 ++---------------- 1 file changed, 11 insertions(+), 128 deletions(-) diff --git a/commons/src/main/java/org/archive/util/Base32.java b/commons/src/main/java/org/archive/util/Base32.java index addfd11e1..92f18d659 100644 --- a/commons/src/main/java/org/archive/util/Base32.java +++ b/commons/src/main/java/org/archive/util/Base32.java @@ -18,142 +18,25 @@ */ package org.archive.util; +import com.google.common.io.BaseEncoding; + /** - * Base32 - encodes and decodes RFC3548 Base32 - * (see http://www.faqs.org/rfcs/rfc3548.html ) - * - * Imported public-domain code of Bitzi. - * - * @author Robert Kaye - * @author Gordon Mohr + * @deprecated Use {@link com.google.common.io.BaseEncoding#base32()} */ +@Deprecated public class Base32 { - private static final String base32Chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"; - private static final int[] base32Lookup = - { 0xFF,0xFF,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, // '0', '1', '2', '3', '4', '5', '6', '7' - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, // '8', '9', ':', ';', '<', '=', '>', '?' 
- 0xFF,0x00,0x01,0x02,0x03,0x04,0x05,0x06, // '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G' - 0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E, // 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O' - 0x0F,0x10,0x11,0x12,0x13,0x14,0x15,0x16, // 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W' - 0x17,0x18,0x19,0xFF,0xFF,0xFF,0xFF,0xFF, // 'X', 'Y', 'Z', '[', '\', ']', '^', '_' - 0xFF,0x00,0x01,0x02,0x03,0x04,0x05,0x06, // '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g' - 0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E, // 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o' - 0x0F,0x10,0x11,0x12,0x13,0x14,0x15,0x16, // 'p', 'q', 'r', 's', 't', 'u', 'v', 'w' - 0x17,0x18,0x19,0xFF,0xFF,0xFF,0xFF,0xFF // 'x', 'y', 'z', '{', '|', '}', '~', 'DEL' - }; - /** - * Encodes byte array to Base32 String. - * - * @param bytes Bytes to encode. - * @return Encoded byte array bytes as a String. - * + * @deprecated Use {@link com.google.common.io.BaseEncoding#base32()} */ + @Deprecated static public String encode(final byte[] bytes) { - int i = 0, index = 0, digit = 0; - int currByte, nextByte; - StringBuffer base32 = new StringBuffer((bytes.length + 7) * 8 / 5); - - while (i < bytes.length) { - currByte = (bytes[i] >= 0) ? bytes[i] : (bytes[i] + 256); // unsign - - /* Is the current digit going to span a byte boundary? */ - if (index > 3) { - if ((i + 1) < bytes.length) { - nextByte = - (bytes[i + 1] >= 0) ? bytes[i + 1] : (bytes[i + 1] + 256); - } else { - nextByte = 0; - } - - digit = currByte & (0xFF >> index); - index = (index + 5) % 8; - digit <<= index; - digit |= nextByte >> (8 - index); - i++; - } else { - digit = (currByte >> (8 - (index + 5))) & 0x1F; - index = (index + 5) % 8; - if (index == 0) - i++; - } - base32.append(base32Chars.charAt(digit)); - } - - return base32.toString(); + return BaseEncoding.base32().encode(bytes); } - /** - * Decodes the given Base32 String to a raw byte array. - * - * @param base32 - * @return Decoded base32 String as a raw byte array. 
- */ - static public byte[] decode(final String base32) { - int i, index, lookup, offset, digit; - byte[] bytes = new byte[base32.length() * 5 / 8]; - - for (i = 0, index = 0, offset = 0; i < base32.length(); i++) { - lookup = base32.charAt(i) - '0'; - - /* Skip chars outside the lookup table */ - if (lookup < 0 || lookup >= base32Lookup.length) { - continue; - } - - digit = base32Lookup[lookup]; - - /* If this digit is not in the table, ignore it */ - if (digit == 0xFF) { - continue; - } - - if (index <= 3) { - index = (index + 5) % 8; - if (index == 0) { - bytes[offset] |= digit; - offset++; - if (offset >= bytes.length) - break; - } else { - bytes[offset] |= digit << (8 - index); - } - } else { - index = (index + 5) % 8; - bytes[offset] |= (digit >>> index); - offset++; - - if (offset >= bytes.length) { - break; - } - bytes[offset] |= digit << (8 - index); - } - } - return bytes; - } - - /** For testing, take a command-line argument in Base32, decode, print in hex, - * encode, print - * - * @param args + * @deprecated Use {@link com.google.common.io.BaseEncoding#base32()} */ - static public void main(String[] args) { - if (args.length == 0) { - System.out.println("Supply a Base32-encoded argument."); - return; - } - System.out.println(" Original: " + args[0]); - byte[] decoded = Base32.decode(args[0]); - System.out.print(" Hex: "); - for (int i = 0; i < decoded.length; i++) { - int b = decoded[i]; - if (b < 0) { - b += 256; - } - System.out.print((Integer.toHexString(b + 256)).substring(1)); - } - System.out.println(); - System.out.println("Reencoded: " + Base32.encode(decoded)); + @Deprecated + static public byte[] decode(final String base32) { + return BaseEncoding.base32().decode(base32); } } From a4ce59dbd6aab4fe695aac8c8ccde6ad777aebc2 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Sat, 25 Jan 2020 08:25:59 +0900 Subject: [PATCH 069/123] Fix stream closed exception by not closing output stream ServerCall.writeResponseBody() flushes it after we return so it must remain open. Fixes #305 --- .../java/org/archive/crawler/restlet/EditRepresentation.java | 1 - 1 file changed, 1 deletion(-) diff --git a/engine/src/main/java/org/archive/crawler/restlet/EditRepresentation.java b/engine/src/main/java/org/archive/crawler/restlet/EditRepresentation.java index 232dbdd2d..76beedc9f 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/EditRepresentation.java +++ b/engine/src/main/java/org/archive/crawler/restlet/EditRepresentation.java @@ -109,7 +109,6 @@ public void write(Writer writer) throws IOException { pw.println(""); pw.println(""); pw.println(""); - pw.close(); } public FileRepresentation getFileRepresentation() { From 54e05a7864e38ed49703c1df09e78873b670217d Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Tue, 28 Jan 2020 15:12:54 -0500 Subject: [PATCH 070/123] Correct encoding The previous implementation appears to always have returned upper case, was able to encode either case, and did not reutrn padding. 
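Put differently, the contract being preserved is: upper-case output, no `=` padding, and decoding that tolerates either case. A small self-contained sketch of that contract against Guava's `BaseEncoding` (sample input chosen arbitrarily, not taken from the patch below):

    import java.nio.charset.StandardCharsets;

    import com.google.common.io.BaseEncoding;

    public class Base32BehaviourDemo {
        public static void main(String[] args) {
            byte[] raw = "heritrix".getBytes(StandardCharsets.US_ASCII);

            // Upper-case RFC 4648 alphabet, no '=' padding -- what the legacy encoder produced.
            String encoded = BaseEncoding.base32().omitPadding().encode(raw);
            System.out.println(encoded); // NBSXE2LUOJUXQ (no trailing "===")

            // The legacy decoder accepted either case; normalising the input first lets a
            // single upper-case BaseEncoding instance serve both.
            byte[] fromUpper = BaseEncoding.base32().omitPadding().decode(encoded);
            byte[] fromLower = BaseEncoding.base32().omitPadding()
                    .decode("nbsxe2luojuxq".toUpperCase());
            System.out.println(new String(fromUpper, StandardCharsets.US_ASCII)
                    + " " + new String(fromLower, StandardCharsets.US_ASCII)); // heritrix heritrix
        }
    }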
--- commons/src/main/java/org/archive/util/Base32.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/commons/src/main/java/org/archive/util/Base32.java b/commons/src/main/java/org/archive/util/Base32.java index 92f18d659..ad06309f7 100644 --- a/commons/src/main/java/org/archive/util/Base32.java +++ b/commons/src/main/java/org/archive/util/Base32.java @@ -30,13 +30,20 @@ public class Base32 { */ @Deprecated static public String encode(final byte[] bytes) { - return BaseEncoding.base32().encode(bytes); + return BaseEncoding.base32() + .omitPadding() + .lowerCase() + .encode(bytes) + .toUpperCase(); } /** * @deprecated Use {@link com.google.common.io.BaseEncoding#base32()} */ @Deprecated static public byte[] decode(final String base32) { - return BaseEncoding.base32().decode(base32); + return BaseEncoding.base32() + .omitPadding() + .lowerCase() + .decode(base32.toLowerCase()); } } From 8c1c8009c65ed4ffe40cac729838d279ca559c16 Mon Sep 17 00:00:00 2001 From: Lauren Ko Date: Thu, 30 Jan 2020 16:48:38 -0600 Subject: [PATCH 071/123] Fix stream closed exception for Paged view --- .../java/org/archive/crawler/restlet/PagedRepresentation.java | 1 - 1 file changed, 1 deletion(-) diff --git a/engine/src/main/java/org/archive/crawler/restlet/PagedRepresentation.java b/engine/src/main/java/org/archive/crawler/restlet/PagedRepresentation.java index aabd0d468..b032f9c12 100644 --- a/engine/src/main/java/org/archive/crawler/restlet/PagedRepresentation.java +++ b/engine/src/main/java/org/archive/crawler/restlet/PagedRepresentation.java @@ -144,7 +144,6 @@ public void write(Writer writer) throws IOException { pw.println(""); emitControls(pw); - pw.close(); } /** From bc6a15b41dd7d5b48f51b8ee5fb40884717df276 Mon Sep 17 00:00:00 2001 From: Jonathan Leitschuh Date: Tue, 11 Feb 2020 17:43:34 +0900 Subject: [PATCH 072/123] Use HTTPS to resolve dependencies in Maven Build where possible --- commons/pom.xml | 2 +- engine/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/commons/pom.xml b/commons/pom.xml index 910d0d09e..6e6ad18c1 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -18,7 +18,7 @@ download.oracle.com,maven - http://download.oracle.com/maven + https://download.oracle.com/maven diff --git a/engine/pom.xml b/engine/pom.xml index 9051e9e0b..79593ec91 100644 --- a/engine/pom.xml +++ b/engine/pom.xml @@ -14,7 +14,7 @@ maven-restlet - http://maven.restlet.org + https://maven.restlet.org From 6fcc5e808a790ff9e8bf640e9f88bab7b71214ab Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Thu, 13 Feb 2020 16:59:13 +0900 Subject: [PATCH 073/123] Exclude hbase-client's guava 12 transitive dependency Guava 12 from hbase-client is closer to the root of the dependency tree than guava 17 from webarchive-commons so Maven prefers it. But recent changes to heritrix-commons rely on classes in the newer version of Guava. So let's ensure webarchive-commons wins. Hopefully this doesn't break the hbase module, I have no way of testing it. 
Fixes #311 --- contrib/pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/contrib/pom.xml b/contrib/pom.xml index 2667cbcda..b270574ec 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -33,6 +33,11 @@ jdk.tools jdk.tools + + + com.google.guava + guava + From 60e40531802ca84eef89b6c6fcd04e5540adc390 Mon Sep 17 00:00:00 2001 From: Colin Rosenthal Date: Tue, 3 Mar 2020 08:52:09 +0100 Subject: [PATCH 074/123] Commented this test back in to make Travis happy --- .../modules/deciderules/MatchesListRegexDecideRuleTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java b/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java index e70195007..eeea404f3 100644 --- a/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java +++ b/modules/src/test/java/org/archive/modules/deciderules/MatchesListRegexDecideRuleTest.java @@ -1,5 +1,6 @@ package org.archive.modules.deciderules; +import com.google.common.annotations.VisibleForTesting; import junit.framework.TestCase; import org.apache.commons.httpclient.URIException; import org.archive.modules.CrawlURI; @@ -16,7 +17,7 @@ public class MatchesListRegexDecideRuleTest extends TestCase { * will never return. * @throws URIException */ - public void xtestEvaluate() throws URIException { + public void testEvaluate() throws URIException { final String regex = "http://www\\.netarkivet\\.dk/((x+x+)+)y"; String seed = "http://www.netarkivet.dk/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; MatchesListRegexDecideRule rule = new MatchesListRegexDecideRule(); From e54258756a10af47b8ae5ce40ff65c21ef3da835 Mon Sep 17 00:00:00 2001 From: Colin Rosenthal Date: Tue, 3 Mar 2020 09:02:07 +0100 Subject: [PATCH 075/123] Fixed regex timeout handling following suggestion https://github.com/internetarchive/heritrix3/pull/290#discussion_r366711640 --- .../MatchesListRegexDecideRule.java | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java b/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java index 2aedcc23d..fc18384f0 100644 --- a/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java +++ b/modules/src/main/java/org/archive/modules/deciderules/MatchesListRegexDecideRule.java @@ -1,8 +1,8 @@ /* * This file is part of the Heritrix web crawler (crawler.archive.org). * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with @@ -22,6 +22,8 @@ import java.util.List; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.logging.Level; @@ -29,6 +31,7 @@ import java.util.regex.Pattern; import org.archive.modules.CrawlURI; +import org.archive.util.InterruptibleCharSequence; /** * Rule applies configured decision to any CrawlURIs whose String URI @@ -37,7 +40,7 @@ * The list of regular expressions can be considered logically AND or OR. 
* * @author Kristinn Sigurdsson - * + * * @see MatchesRegexDecideRule */ public class MatchesListRegexDecideRule extends PredicatedDecideRule { @@ -111,19 +114,28 @@ protected boolean evaluate(CrawlURI uri) { if (getTimeoutPerRegexSeconds() <= 0) { matches = p.matcher(str).matches(); } else { - CompletableFuture matchesFuture = CompletableFuture.supplyAsync(() -> p.matcher(str).matches()); + InterruptibleCharSequence interruptible = new InterruptibleCharSequence(str); + FutureTask matchesFuture = new FutureTask<>(() -> p.matcher(interruptible).matches()); + ForkJoinPool.commonPool().submit(matchesFuture); try { - matches = matchesFuture.get(getTimeoutPerRegexSeconds(), TimeUnit.SECONDS); - } catch (Exception e) { - logger.info("Exception while matching regex '" + p + "' to url '" + str + "' so assuming no match. " + e.getClass().getName()); + matchesFuture.get(getTimeoutPerRegexSeconds(), TimeUnit.SECONDS); + } catch (TimeoutException e) { + matchesFuture.cancel(true); + logger.warning("Timed out after " + getTimeoutPerRegexSeconds() + " seconds waiting for '" + p + "' to match."); + } catch (InterruptedException e) { + matchesFuture.cancel(true); + logger.warning("InterruptedException while waiting for '" + p + "' to match."); + } catch (ExecutionException e) { + matchesFuture.cancel(true); + logger.warning("ExecutionException while waiting for '" + p + "' to match: " + e.getMessage()); } } if (logger.isLoggable(Level.FINER)) { logger.finer("Tested '" + str + "' match with regex '" + - p.pattern() + " and result was " + matches); + p.pattern() + " and result was " + matches); } - + if(matches){ if(listLogicOR){ // OR based and we just got a match, done! @@ -137,12 +149,12 @@ protected boolean evaluate(CrawlURI uri) { } } } - + if (listLogicOR) { return false; } else { return true; } } - + } \ No newline at end of file From 348b5330bc21cbdae0554f3ef75c1991baf328ca Mon Sep 17 00:00:00 2001 From: Tim Hennekey Date: Wed, 4 Mar 2020 09:37:31 -0500 Subject: [PATCH 076/123] Utilize the `d` parameter There are uses of this class that necessitate a lower false positive rate than the value that was hard-coded. By making use of the parameter, as indicated in the JavaDoc, the resulting delegate should meet the demands of the caller. --- commons/src/main/java/org/archive/util/BloomFilter64bit.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commons/src/main/java/org/archive/util/BloomFilter64bit.java b/commons/src/main/java/org/archive/util/BloomFilter64bit.java index 9c048a595..174fc8f4e 100644 --- a/commons/src/main/java/org/archive/util/BloomFilter64bit.java +++ b/commons/src/main/java/org/archive/util/BloomFilter64bit.java @@ -80,7 +80,7 @@ public BloomFilter64bit( final long n, final int d, boolean roundUp) { * @param roundUp if true, round bit size up to next-nearest-power-of-2 */ public BloomFilter64bit(final long n, final int d, Random weightsGenerator, boolean roundUp ) { - delegate = com.google.common.hash.BloomFilter.create(Funnels.unencodedCharsFunnel(), Ints.saturatedCast(n), 0.0000003); + delegate = com.google.common.hash.BloomFilter.create(Funnels.unencodedCharsFunnel(), Ints.saturatedCast(n), Math.pow(2, -d)); this.expectedInserts = n; try { Method bitSizeMethod = delegate.getClass().getDeclaredMethod("bitSize", new Class[] {}); From b4494a7ffb3fb51f3759e1ff8783eb8b3c5ce793 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 4 Mar 2020 15:42:29 +0000 Subject: [PATCH 077/123] Use the Wayback Machine to repair a link to Oracle docs. 
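Stepping back to the `d`-parameter change a couple of patches above: the class's original javadoc puts the false-positive probability of a filter with d hash functions at roughly 2^-d, which is exactly the `fpp` now passed to Guava's `create()` via `Math.pow(2, -d)`. A toy illustration of that mapping (the insert count and d value are made up, not taken from any Heritrix configuration):

    import com.google.common.hash.BloomFilter;
    import com.google.common.hash.Funnels;

    public class BloomFppDemo {
        public static void main(String[] args) {
            int expectedInserts = 1_000_000; // illustrative value only
            int d = 22;                      // illustrative hash-function count

            // d hash functions correspond to a false-positive probability of about 2^-d
            double fpp = Math.pow(2, -d);    // roughly 2.4e-7 for d = 22

            BloomFilter<CharSequence> filter = BloomFilter.create(
                    Funnels.unencodedCharsFunnel(), expectedInserts, fpp);

            filter.put("http://example.com/");
            System.out.println(filter.mightContain("http://example.com/")); // true
            System.out.println(filter.mightContain("http://example.org/")); // almost certainly false
            System.out.printf("requested fpp = %.1e%n", fpp);
        }
    }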
--- commons/src/main/java/org/archive/bdb/BdbModule.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commons/src/main/java/org/archive/bdb/BdbModule.java b/commons/src/main/java/org/archive/bdb/BdbModule.java index d5ee2eb94..dd86405d8 100644 --- a/commons/src/main/java/org/archive/bdb/BdbModule.java +++ b/commons/src/main/java/org/archive/bdb/BdbModule.java @@ -269,7 +269,7 @@ protected void setup(File f, boolean create) config.setSharedCache(getUseSharedCache()); // we take the advice literally from... - // http://www.oracle.com/technology/products/berkeley-db/faq/je_faq.html#33 + // https://web.archive.org/web/20100727081707/http://www.oracle.com/technology/products/berkeley-db/faq/je_faq.html#33 long nLockTables = getExpectedConcurrency()-1; while(!BigInteger.valueOf(nLockTables).isProbablePrime(Integer.MAX_VALUE)) { nLockTables--; From d5fad011221b5c2312fa112596c412a9b8119432 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 4 Mar 2020 16:21:21 +0000 Subject: [PATCH 078/123] Re-sync of changelog prior to release. --- CHANGELOG.md | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d1512ae2..f831efb59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,61 @@ # Change Log +## [Unreleased](https://github.com/internetarchive/heritrix3/tree/HEAD) + +[Full Changelog](https://github.com/internetarchive/heritrix3/compare/3.4.0-20190418...HEAD) + +**Fixed bugs:** + +- exception logged when opening/saving crawler-beans.cxml via web interface editor [\#305](https://github.com/internetarchive/heritrix3/issues/305) +- Java interface text editor error when saving crawler-beans.cxml [\#293](https://github.com/internetarchive/heritrix3/issues/293) +- Unable to upload crawler-beans.cxml with curl [\#282](https://github.com/internetarchive/heritrix3/issues/282) +- CookieStoreTest.testConcurrentLoad fails randomly [\#274](https://github.com/internetarchive/heritrix3/issues/274) + +**Closed issues:** + +- Contrib project has a maven dependency with an older version of guava library. 
[\#311](https://github.com/internetarchive/heritrix3/issues/311) +- BloomFilter64bitTest is slow [\#299](https://github.com/internetarchive/heritrix3/issues/299) +- ObjectIdentityBdbManualCacheTest is slow [\#297](https://github.com/internetarchive/heritrix3/issues/297) +- HTTPS console inaccessible via browser [\#279](https://github.com/internetarchive/heritrix3/issues/279) +- JDK11 support: ssl errors from console [\#275](https://github.com/internetarchive/heritrix3/issues/275) +- JDK11 support: FetchHTTPTest: ssl handshake\_failure [\#268](https://github.com/internetarchive/heritrix3/issues/268) +- JDK11 support: org.archive.util.ObjectIdentityBdbCacheTest failures [\#267](https://github.com/internetarchive/heritrix3/issues/267) +- JDK11 support: ClassNotFoundException: javax.transaction.xa.Xid [\#266](https://github.com/internetarchive/heritrix3/issues/266) +- JDK11 support: tools.jar [\#265](https://github.com/internetarchive/heritrix3/issues/265) +- JDK11 support: jaxb [\#264](https://github.com/internetarchive/heritrix3/issues/264) + +**Merged pull requests:** + +- Exclude hbase-client's guava 12 transitive dependency [\#312](https://github.com/internetarchive/heritrix3/pull/312) ([ato](https://github.com/ato)) +- Fix stream closed exception for Paged view [\#308](https://github.com/internetarchive/heritrix3/pull/308) ([ldko](https://github.com/ldko)) +- Fix stream closed exception by not closing output stream [\#306](https://github.com/internetarchive/heritrix3/pull/306) ([ato](https://github.com/ato)) +- Replace custom Base32 encoding [\#304](https://github.com/internetarchive/heritrix3/pull/304) ([hennekey](https://github.com/hennekey)) +- Replace constant with accessor methods [\#303](https://github.com/internetarchive/heritrix3/pull/303) ([hennekey](https://github.com/hennekey)) +- limit ExtractorYoutubeDL heap usage [\#302](https://github.com/internetarchive/heritrix3/pull/302) ([nlevitt](https://github.com/nlevitt)) +- fix logging config [\#301](https://github.com/internetarchive/heritrix3/pull/301) ([nlevitt](https://github.com/nlevitt)) +- Use Guice instead of custom implementation [\#300](https://github.com/internetarchive/heritrix3/pull/300) ([hennekey](https://github.com/hennekey)) +- Speed up ObjectIdentityBdbManualCacheTest [\#298](https://github.com/internetarchive/heritrix3/pull/298) ([hennekey](https://github.com/hennekey)) +- Set JUnit version to latest [\#296](https://github.com/internetarchive/heritrix3/pull/296) ([hennekey](https://github.com/hennekey)) +- Disable test that connects to wwwb-dedup.us.archive.org [\#295](https://github.com/internetarchive/heritrix3/pull/295) ([ato](https://github.com/ato)) +- Fix 'Method Not Allowed' on POST of config editor form [\#294](https://github.com/internetarchive/heritrix3/pull/294) ([ato](https://github.com/ato)) +- Crawltrap regex timeout [\#290](https://github.com/internetarchive/heritrix3/pull/290) ([csrster](https://github.com/csrster)) +- Bdb frontier access [\#289](https://github.com/internetarchive/heritrix3/pull/289) ([csrster](https://github.com/csrster)) +- Attempt to filter out embedded images. [\#288](https://github.com/internetarchive/heritrix3/pull/288) ([csrster](https://github.com/csrster)) +- change trough dedup `date` type to varchar. 
[\#287](https://github.com/internetarchive/heritrix3/pull/287) ([nlevitt](https://github.com/nlevitt)) +- Add support for forced queue assignment and parallel queues [\#286](https://github.com/internetarchive/heritrix3/pull/286) ([adam-miller](https://github.com/adam-miller)) +- Warc writer chain [\#285](https://github.com/internetarchive/heritrix3/pull/285) ([nlevitt](https://github.com/nlevitt)) +- Fix jobdir PUT [\#283](https://github.com/internetarchive/heritrix3/pull/283) ([ato](https://github.com/ato)) +- WIP: Upgrade BDB JE to version 7.5.11 [\#281](https://github.com/internetarchive/heritrix3/pull/281) ([anjackson](https://github.com/anjackson)) +- Mitigate random CookieStore.testConcurrentLoad test failures [\#280](https://github.com/internetarchive/heritrix3/pull/280) ([ato](https://github.com/ato)) +- JDK11 support: upgrade to Jetty 9.4.19, Restlet 2.4.0 and drop JDK 7 support [\#276](https://github.com/internetarchive/heritrix3/pull/276) ([ato](https://github.com/ato)) +- JDK11 support: remove unused class ObjectIdentityBdbCache and tests [\#273](https://github.com/internetarchive/heritrix3/pull/273) ([ato](https://github.com/ato)) +- JDK11 support: upgrade maven-surefire-plugin to 2.22.2 [\#272](https://github.com/internetarchive/heritrix3/pull/272) ([ato](https://github.com/ato)) +- JDK11 support: exclude tools.jar from hbase-client dependency [\#271](https://github.com/internetarchive/heritrix3/pull/271) ([ato](https://github.com/ato)) +- Travis fixes [\#270](https://github.com/internetarchive/heritrix3/pull/270) ([ato](https://github.com/ato)) +- JDK11 support: explicitly depend on JAXB [\#269](https://github.com/internetarchive/heritrix3/pull/269) ([ato](https://github.com/ato)) +- WIP: ExtractorYoutubeDL [\#257](https://github.com/internetarchive/heritrix3/pull/257) ([nlevitt](https://github.com/nlevitt)) +- Update README and add LICENSE.txt [\#256](https://github.com/internetarchive/heritrix3/pull/256) ([ruebot](https://github.com/ruebot)) + ## [3.4.0-20190418](https://github.com/internetarchive/heritrix3/tree/3.4.0-20190418) (2019-04-18) [Full Changelog](https://github.com/internetarchive/heritrix3/compare/3.4.0-20190207...3.4.0-20190418) @@ -29,6 +85,7 @@ - Trough dedup [\#242](https://github.com/internetarchive/heritrix3/pull/242) ([nlevitt](https://github.com/nlevitt)) - Ensure we start parsing full lines, for \#239. [\#240](https://github.com/internetarchive/heritrix3/pull/240) ([anjackson](https://github.com/anjackson)) - Add CHANGELOG; address \#233. [\#238](https://github.com/internetarchive/heritrix3/pull/238) ([ruebot](https://github.com/ruebot)) +- fix for test failures in a workspace on NFS-mounted filesystem [\#196](https://github.com/internetarchive/heritrix3/pull/196) ([kngenie](https://github.com/kngenie)) ## [3.4.0-20190207](https://github.com/internetarchive/heritrix3/tree/3.4.0-20190207) (2019-02-07) [Full Changelog](https://github.com/internetarchive/heritrix3/compare/3.4.0-20190205...3.4.0-20190207) @@ -84,7 +141,6 @@ - Fix link to User Guide [\#207](https://github.com/internetarchive/heritrix3/pull/207) ([maurice-schleussinger](https://github.com/maurice-schleussinger)) - Add parameter to allow even distribution for parallel queues. 
[\#205](https://github.com/internetarchive/heritrix3/pull/205) ([adam-miller](https://github.com/adam-miller)) - catch exceptions scoping outlinks to stop them from derailing process… [\#197](https://github.com/internetarchive/heritrix3/pull/197) ([nlevitt](https://github.com/nlevitt)) -- fix for test failures in a workspace on NFS-mounted filesystem [\#196](https://github.com/internetarchive/heritrix3/pull/196) ([kngenie](https://github.com/kngenie)) - limit max size of form input [\#194](https://github.com/internetarchive/heritrix3/pull/194) ([galgeek](https://github.com/galgeek)) - Enforce robots.txt character limit per char not per line [\#192](https://github.com/internetarchive/heritrix3/pull/192) ([ato](https://github.com/ato)) - Allow JavaDNS to be disabled as part of resolving outstanding build and test issues [\#190](https://github.com/internetarchive/heritrix3/pull/190) ([anjackson](https://github.com/anjackson)) From 384da2e82f58236646fd64472ec53179513a530d Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 4 Mar 2020 23:03:47 +0000 Subject: [PATCH 079/123] Disable doclint --- pom.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pom.xml b/pom.xml index 77255c67f..2a6b13874 100644 --- a/pom.xml +++ b/pom.xml @@ -409,6 +409,8 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html UTF-8 ${maven.build.timestamp} + none + -Xdoclint:none From a1bdcb1be73a76b0a4811c3a1df76175dc1a3d11 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 4 Mar 2020 23:05:33 +0000 Subject: [PATCH 080/123] [maven-release-plugin] prepare release 3.4.0-20200304 --- commons/pom.xml | 2 +- contrib/pom.xml | 2 +- dist/pom.xml | 2 +- engine/pom.xml | 2 +- modules/pom.xml | 2 +- pom.xml | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/commons/pom.xml b/commons/pom.xml index e383a0779..cf295b7a1 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-SNAPSHOT + 3.4.0-20200304 4.0.0 org.archive.heritrix diff --git a/contrib/pom.xml b/contrib/pom.xml index b270574ec..b755d0639 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-SNAPSHOT + 3.4.0-20200304 org.archive.heritrix heritrix-contrib diff --git a/dist/pom.xml b/dist/pom.xml index 9a7ddc077..0b7fb8526 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-SNAPSHOT + 3.4.0-20200304 4.0.0 org.archive.heritrix diff --git a/engine/pom.xml b/engine/pom.xml index 79593ec91..f6724b55d 100644 --- a/engine/pom.xml +++ b/engine/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-SNAPSHOT + 3.4.0-20200304 4.0.0 org.archive.heritrix diff --git a/modules/pom.xml b/modules/pom.xml index ff3816aed..4e6350611 100644 --- a/modules/pom.xml +++ b/modules/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-SNAPSHOT + 3.4.0-20200304 4.0.0 org.archive.heritrix diff --git a/pom.xml b/pom.xml index 2a6b13874..f75ac0320 100644 --- a/pom.xml +++ b/pom.xml @@ -18,7 +18,7 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html org.archive heritrix - 3.4.0-SNAPSHOT + 3.4.0-20200304 pom Heritrix 3 @@ -32,7 +32,7 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html scm:git:https://github.com/internetarchive/heritrix3.git scm:git:https://github.com/internetarchive/heritrix3.git https://github.com/internetarchive/heritrix3 - HEAD + 3.4.0-20200304 From ff05de44b6f5906ff5deef9aaf3a436a51f5c17d Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 4 Mar 2020 23:05:41 +0000 Subject: [PATCH 081/123] [maven-release-plugin] prepare for 
next development iteration --- commons/pom.xml | 2 +- contrib/pom.xml | 2 +- dist/pom.xml | 2 +- engine/pom.xml | 2 +- modules/pom.xml | 2 +- pom.xml | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/commons/pom.xml b/commons/pom.xml index cf295b7a1..e383a0779 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-20200304 + 3.4.0-SNAPSHOT 4.0.0 org.archive.heritrix diff --git a/contrib/pom.xml b/contrib/pom.xml index b755d0639..b270574ec 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-20200304 + 3.4.0-SNAPSHOT org.archive.heritrix heritrix-contrib diff --git a/dist/pom.xml b/dist/pom.xml index 0b7fb8526..9a7ddc077 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-20200304 + 3.4.0-SNAPSHOT 4.0.0 org.archive.heritrix diff --git a/engine/pom.xml b/engine/pom.xml index f6724b55d..79593ec91 100644 --- a/engine/pom.xml +++ b/engine/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-20200304 + 3.4.0-SNAPSHOT 4.0.0 org.archive.heritrix diff --git a/modules/pom.xml b/modules/pom.xml index 4e6350611..ff3816aed 100644 --- a/modules/pom.xml +++ b/modules/pom.xml @@ -3,7 +3,7 @@ org.archive heritrix - 3.4.0-20200304 + 3.4.0-SNAPSHOT 4.0.0 org.archive.heritrix diff --git a/pom.xml b/pom.xml index f75ac0320..2a6b13874 100644 --- a/pom.xml +++ b/pom.xml @@ -18,7 +18,7 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html org.archive heritrix - 3.4.0-20200304 + 3.4.0-SNAPSHOT pom Heritrix 3 @@ -32,7 +32,7 @@ http://maven.apache.org/guides/mini/guide-m1-m2.html scm:git:https://github.com/internetarchive/heritrix3.git scm:git:https://github.com/internetarchive/heritrix3.git https://github.com/internetarchive/heritrix3 - 3.4.0-20200304 + HEAD From fc1de3066866c8a73437a6f39ac7c30c21bf8579 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 4 Mar 2020 23:21:15 +0000 Subject: [PATCH 082/123] Updated changelog post release --- CHANGELOG.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f831efb59..1a3f422b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,7 @@ # Change Log -## [Unreleased](https://github.com/internetarchive/heritrix3/tree/HEAD) - -[Full Changelog](https://github.com/internetarchive/heritrix3/compare/3.4.0-20190418...HEAD) +## [3.4.0-20200304](https://github.com/internetarchive/heritrix3/tree/3.4.0-20200304) (2020-03-04) +[Full Changelog](https://github.com/internetarchive/heritrix3/compare/3.4.0-20190418...3.4.0-20200304) **Fixed bugs:** @@ -26,6 +25,8 @@ **Merged pull requests:** +- Use the Wayback Machine to repair a link to Oracle docs. 
[\#315](https://github.com/internetarchive/heritrix3/pull/315) ([anjackson](https://github.com/anjackson)) +- Utilize the `d` parameter [\#314](https://github.com/internetarchive/heritrix3/pull/314) ([hennekey](https://github.com/hennekey)) - Exclude hbase-client's guava 12 transitive dependency [\#312](https://github.com/internetarchive/heritrix3/pull/312) ([ato](https://github.com/ato)) - Fix stream closed exception for Paged view [\#308](https://github.com/internetarchive/heritrix3/pull/308) ([ldko](https://github.com/ldko)) - Fix stream closed exception by not closing output stream [\#306](https://github.com/internetarchive/heritrix3/pull/306) ([ato](https://github.com/ato)) @@ -45,7 +46,7 @@ - Add support for forced queue assignment and parallel queues [\#286](https://github.com/internetarchive/heritrix3/pull/286) ([adam-miller](https://github.com/adam-miller)) - Warc writer chain [\#285](https://github.com/internetarchive/heritrix3/pull/285) ([nlevitt](https://github.com/nlevitt)) - Fix jobdir PUT [\#283](https://github.com/internetarchive/heritrix3/pull/283) ([ato](https://github.com/ato)) -- WIP: Upgrade BDB JE to version 7.5.11 [\#281](https://github.com/internetarchive/heritrix3/pull/281) ([anjackson](https://github.com/anjackson)) +- Upgrade BDB JE to version 7.5.11 - IMPORTANT CHANGE [\#281](https://github.com/internetarchive/heritrix3/pull/281) ([anjackson](https://github.com/anjackson)) - Mitigate random CookieStore.testConcurrentLoad test failures [\#280](https://github.com/internetarchive/heritrix3/pull/280) ([ato](https://github.com/ato)) - JDK11 support: upgrade to Jetty 9.4.19, Restlet 2.4.0 and drop JDK 7 support [\#276](https://github.com/internetarchive/heritrix3/pull/276) ([ato](https://github.com/ato)) - JDK11 support: remove unused class ObjectIdentityBdbCache and tests [\#273](https://github.com/internetarchive/heritrix3/pull/273) ([ato](https://github.com/ato)) @@ -85,7 +86,6 @@ - Trough dedup [\#242](https://github.com/internetarchive/heritrix3/pull/242) ([nlevitt](https://github.com/nlevitt)) - Ensure we start parsing full lines, for \#239. [\#240](https://github.com/internetarchive/heritrix3/pull/240) ([anjackson](https://github.com/anjackson)) - Add CHANGELOG; address \#233. [\#238](https://github.com/internetarchive/heritrix3/pull/238) ([ruebot](https://github.com/ruebot)) -- fix for test failures in a workspace on NFS-mounted filesystem [\#196](https://github.com/internetarchive/heritrix3/pull/196) ([kngenie](https://github.com/kngenie)) ## [3.4.0-20190207](https://github.com/internetarchive/heritrix3/tree/3.4.0-20190207) (2019-02-07) [Full Changelog](https://github.com/internetarchive/heritrix3/compare/3.4.0-20190205...3.4.0-20190207) @@ -141,6 +141,7 @@ - Fix link to User Guide [\#207](https://github.com/internetarchive/heritrix3/pull/207) ([maurice-schleussinger](https://github.com/maurice-schleussinger)) - Add parameter to allow even distribution for parallel queues. 
[\#205](https://github.com/internetarchive/heritrix3/pull/205) ([adam-miller](https://github.com/adam-miller)) - catch exceptions scoping outlinks to stop them from derailing process… [\#197](https://github.com/internetarchive/heritrix3/pull/197) ([nlevitt](https://github.com/nlevitt)) +- fix for test failures in a workspace on NFS-mounted filesystem [\#196](https://github.com/internetarchive/heritrix3/pull/196) ([kngenie](https://github.com/kngenie)) - limit max size of form input [\#194](https://github.com/internetarchive/heritrix3/pull/194) ([galgeek](https://github.com/galgeek)) - Enforce robots.txt character limit per char not per line [\#192](https://github.com/internetarchive/heritrix3/pull/192) ([ato](https://github.com/ato)) - Allow JavaDNS to be disabled as part of resolving outstanding build and test issues [\#190](https://github.com/internetarchive/heritrix3/pull/190) ([anjackson](https://github.com/anjackson)) From a384dea2f912e6a12c034b15e1484586ad118176 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Wed, 25 Mar 2020 22:33:06 +0000 Subject: [PATCH 083/123] Recycle the Matcher after use. --- .../archive/modules/extractor/ExtractorMultipleRegex.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorMultipleRegex.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorMultipleRegex.java index e3afa0fb3..536ff7b25 100644 --- a/modules/src/main/java/org/archive/modules/extractor/ExtractorMultipleRegex.java +++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorMultipleRegex.java @@ -192,6 +192,7 @@ public MatchList(String regex, CharSequence cs) { while (matcher.find()) { add(new GroupList(matcher)); } + TextUtils.recycleMatcher(matcher); } public MatchList(GroupList... 
groupList) { for (GroupList x: groupList) { @@ -219,6 +220,7 @@ public void extract(CrawlURI curi) { matchLists = new LinkedHashMap(); matchLists.put("uriRegex", new MatchList(new GroupList(matcher))); } else { + TextUtils.recycleMatcher(matcher); return; // if uri regex doesn't match, we're done } @@ -229,6 +231,7 @@ public void extract(CrawlURI curi) { curi.getNonFatalFailures().add(e); LOGGER.log(Level.WARNING, "Failed get of replay char sequence in " + Thread.currentThread().getName(), e); + TextUtils.recycleMatcher(matcher); return; } @@ -237,6 +240,7 @@ public void extract(CrawlURI curi) { String regex = getContentRegexes().get(regexName); MatchList matchList = new MatchList(regex, cs); if (matchList.isEmpty()) { + TextUtils.recycleMatcher(matcher); return; // no match found for regex, so we can stop now } matchLists.put(regexName, matchList); @@ -257,6 +261,7 @@ public void extract(CrawlURI curi) { Map bindings = makeBindings(matchLists, regexNames, i); buildAndAddOutlink(curi, bindings); } + TextUtils.recycleMatcher(matcher); } // bindings are the variables available to populate the template From 40725372d67d9b30cafe86defa954c066e2d52ef Mon Sep 17 00:00:00 2001 From: Leslie Bellony Date: Wed, 1 Apr 2020 14:47:34 +0200 Subject: [PATCH 084/123] Add support for the SFTP protocol --- commons/pom.xml | 6 +- .../main/java/org/archive/net/ClientSFTP.java | 153 +++++ .../hbase/HBasePersistLoadProcessor.java | 2 +- modules/pom.xml | 5 + .../deciderules/SchemeNotInSetDecideRule.java | 2 +- .../archive/modules/fetcher/FetchSFTP.java | 586 ++++++++++++++++++ .../recrawl/AbstractPersistProcessor.java | 2 +- .../FtpControlConversationRecordBuilder.java | 2 +- .../warc/FtpResponseRecordBuilder.java | 2 +- .../modules/warc/MetadataRecordBuilder.java | 2 +- .../modules/warc/RevisitRecordBuilder.java | 2 +- .../modules/writer/WARCWriterProcessor.java | 2 +- .../modules/writer/WriterPoolProcessor.java | 2 +- 13 files changed, 758 insertions(+), 10 deletions(-) create mode 100644 commons/src/main/java/org/archive/net/ClientSFTP.java create mode 100644 modules/src/main/java/org/archive/modules/fetcher/FetchSFTP.java diff --git a/commons/pom.xml b/commons/pom.xml index e383a0779..889dffbb0 100644 --- a/commons/pom.xml +++ b/commons/pom.xml @@ -195,7 +195,11 @@ - + + com.jcraft + jsch + 0.1.52 + diff --git a/commons/src/main/java/org/archive/net/ClientSFTP.java b/commons/src/main/java/org/archive/net/ClientSFTP.java new file mode 100644 index 000000000..9febbb65f --- /dev/null +++ b/commons/src/main/java/org/archive/net/ClientSFTP.java @@ -0,0 +1,153 @@ +package org.archive.net; + +import com.jcraft.jsch.Channel; +import com.jcraft.jsch.ChannelSftp; +import com.jcraft.jsch.JSch; +import com.jcraft.jsch.JSchException; +import com.jcraft.jsch.Session; +import com.jcraft.jsch.SftpATTRS; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.Socket; +import java.net.SocketException; +import java.util.HashMap; +import java.util.Map; +import java.util.Vector; +import java.util.logging.Logger; + +public class ClientSFTP { + private final Logger logger = Logger.getLogger(getClass().getName()); + + protected StringBuilder controlConversation; + + protected Socket dataSocket; + protected static Session session = null; + protected static Channel channel = null; + protected static ChannelSftp channelSFTP = null; + + public ClientSFTP() { + this.controlConversation = new StringBuilder(); + this.logger.setUseParentHandlers(true); + } + + public String 
getControlConversation() { + return this.controlConversation.toString(); + } + + public void connect(String paramString1, String paramString2, int paramInt, String paramString3) throws SocketException, IOException { + JSch jSch = new JSch(); + + try { + session = jSch.getSession(paramString1, paramString2, paramInt); + session.setConfig("StrictHostKeyChecking", "no"); + session.setPassword(paramString3); + session.connect(); + this.logger.info("Connected to SFTP server " + paramString2 + ":" + paramInt); + this.controlConversation.append("Connected to SFTP server " + paramString2 + ":" + paramInt); + } catch (Exception exception) { + this.logger.info("Unable to connect to SFTP server : " + exception.toString()); + this.controlConversation.append("Unable to connect to SFTP server : " + exception.toString()); + session.disconnect(); + } + } + + public ChannelSftp openSFTPChannel() throws JSchException { + channel = session.openChannel("sftp"); + channel.connect(); + channelSFTP = (ChannelSftp) channel; + this.logger.info("*** SFTP Channel created. ***"); + boolean bool = channelSFTP.isConnected(); + if (bool) + this.logger.info("channelSftp connected "); + return channelSFTP; + } + + public boolean isConnected() { + if (channelSFTP != null) { + return channelSFTP.isConnected(); + } + + return false; + } + + public ChannelSftp getChannelSftp() { + return channelSFTP; + } + + public void exit() { + session.disconnect(); + } + + public boolean isDirectory(String paramString) throws Exception { + SftpATTRS sftpATTRS = channelSFTP.stat(paramString); + return sftpATTRS.isDir(); + } + + public void disconnect() { + if (channelSFTP != null) { + channelSFTP.exit(); + this.logger.info("channelSftp exit...."); + } + if (session != null) { + session.disconnect(); + this.logger.info("session disconnect...."); + } + channelSFTP = null; + } + + @SuppressWarnings({ "unchecked", "rawtypes" }) + public Map getLsFileMap(String paramString) throws Exception { + HashMap hashMap = new HashMap<>(); + + Vector vector = channelSFTP.ls(paramString); + for (byte b = 0; b < vector.size(); b++) { + ChannelSftp.LsEntry lsEntry = vector.get(b); + String str = lsEntry.getFilename(); + + if (!".".equals(str)) { + + if (!"..".equals(str)) { + + String str1 = paramString + "/" + str; + boolean bool = isDirectory(str1); + + this.logger.info("fileName : " + str); + hashMap.put(str, Boolean.valueOf(bool)); + } + } + } + return (Map) hashMap; + } + + protected boolean mkdir(String paramString) throws Exception { + this.logger.info("channelSftp mkdir: " + paramString); + channelSFTP.mkdir(paramString); + return true; + } + + public boolean cd(String paramString) throws Exception { + this.logger.info("channelSftp cd : " + paramString); + channelSFTP.cd(paramString); + return true; + } + + protected boolean put(FileInputStream paramFileInputStream, String paramString) throws Exception { + this.logger.info("channelSftp put: " + paramString); + channelSFTP.put(paramFileInputStream, paramString); + return true; + } + + protected boolean downRemoteSingleFile(String paramString1, String paramString2) throws Exception { + FileOutputStream fileOutputStream = new FileOutputStream(paramString2); + channelSFTP.get(paramString1, fileOutputStream); + + this.logger.info("channelSftp download file: " + paramString1); + return true; + } + + protected boolean rm(String paramString) throws Exception { + channelSFTP.rm(paramString); + return true; + } +} diff --git 
a/contrib/src/main/java/org/archive/modules/recrawl/hbase/HBasePersistLoadProcessor.java b/contrib/src/main/java/org/archive/modules/recrawl/hbase/HBasePersistLoadProcessor.java index 6a967236d..b72f7d7a0 100644 --- a/contrib/src/main/java/org/archive/modules/recrawl/hbase/HBasePersistLoadProcessor.java +++ b/contrib/src/main/java/org/archive/modules/recrawl/hbase/HBasePersistLoadProcessor.java @@ -80,7 +80,7 @@ protected boolean shouldProcess(CrawlURI uri) { // TODO: we want deduplicate robots.txt, too. //if (uri.isPrerequisite()) return false; String scheme = uri.getUURI().getScheme(); - if (!(scheme.equals("http") || scheme.equals("https") || scheme.equals("ftp"))) { + if (!(scheme.equals("http") || scheme.equals("https") || scheme.equals("ftp") || scheme.equals("sftp"))) { return false; } return true; diff --git a/modules/pom.xml b/modules/pom.xml index ff3816aed..cdf4a52a8 100644 --- a/modules/pom.xml +++ b/modules/pom.xml @@ -62,6 +62,11 @@ 1.6.6 test + + com.jcraft + jsch + 0.1.52 + diff --git a/modules/src/main/java/org/archive/modules/deciderules/SchemeNotInSetDecideRule.java b/modules/src/main/java/org/archive/modules/deciderules/SchemeNotInSetDecideRule.java index f5b1d52b9..23077832f 100644 --- a/modules/src/main/java/org/archive/modules/deciderules/SchemeNotInSetDecideRule.java +++ b/modules/src/main/java/org/archive/modules/deciderules/SchemeNotInSetDecideRule.java @@ -47,7 +47,7 @@ public SchemeNotInSetDecideRule() { protected Set schemes = new HashSet(); { // default set are those schemes Heritrix supports in usual configuration - schemes.addAll(Arrays.asList(new String[] {"http","https","ftp","dns","whois"})); + schemes.addAll(Arrays.asList(new String[] {"http","https","ftp","dns","whois", "sftp"})); } public Set getSchemes() { return schemes; diff --git a/modules/src/main/java/org/archive/modules/fetcher/FetchSFTP.java b/modules/src/main/java/org/archive/modules/fetcher/FetchSFTP.java new file mode 100644 index 000000000..9789de8ae --- /dev/null +++ b/modules/src/main/java/org/archive/modules/fetcher/FetchSFTP.java @@ -0,0 +1,586 @@ +package org.archive.modules.fetcher; + +import static org.archive.modules.CoreAttributeConstants.A_FTP_CONTROL_CONVERSATION; +import static org.archive.modules.CoreAttributeConstants.A_FTP_FETCH_STATUS; +import static org.archive.modules.CoreAttributeConstants.A_RUNTIME_EXCEPTION; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.security.MessageDigest; +import java.util.Vector; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.httpclient.URIException; +import org.archive.io.RecordingInputStream; +import org.archive.io.ReplayCharSequence; +import org.archive.modules.CrawlURI; +import org.archive.modules.Processor; +import org.archive.modules.extractor.Hop; +import org.archive.modules.extractor.LinkContext; +import org.archive.net.ClientSFTP; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; +import org.archive.util.Recorder; + +import com.jcraft.jsch.ChannelSftp; +import com.jcraft.jsch.ChannelSftp.LsEntry; +import com.jcraft.jsch.JSchException; +import com.jcraft.jsch.SftpException; + +public class FetchSFTP extends Processor { + + private static Logger logger = Logger.getLogger(FetchSFTP.class.getName()); + + private static Pattern DIR = Pattern.compile("(.+)$", 8); + + /** + * 
The username to send to SFTP servers. By convention, the default value of + * "anonymous" is used for publicly available SFTP sites. + */ + { + setUsername("anonymous"); + } + + public String getUsername() { + return (String) kp.get("username"); + } + + public void setUsername(String username) { + kp.put("username", username); + } + + /** + * The password to send to SFTP servers. By convention, anonymous users send + * their email address in this field. + */ + { + setPassword("password"); + } + + public String getPassword() { + return (String) kp.get("password"); + } + + public void setPassword(String pw) { + kp.put("password", pw); + } + + /** + * Set to true to extract further URIs from SFTP directories. Default is true. + */ + { + setExtractFromDirs(true); + } + + /** + * Returns the extract.from.dirs attribute for this + * FetchSFTP and the given curi. + * + * @return that curi's extract.from.dirs + */ + public boolean getExtractFromDirs() { + return (Boolean) kp.get("extractFromDirs"); + } + + public void setExtractFromDirs(boolean extractFromDirs) { + kp.put("extractFromDirs", extractFromDirs); + } + + /** + * Set to true to extract the parent URI from all SFTP URIs. Default is true. + */ + { + setExtractParent(true); + } + + /** + * Returns the extract.parent attribute for this + * FetchSFTP and the given curi. + * + * @return that curi's extract-parent + */ + public boolean getExtractParent() { + return (Boolean) kp.get("extractParent"); + } + + public void setExtractParent(boolean extractParent) { + kp.put("extractParent", extractParent); + } + + /** + * Whether or not to perform an on-the-fly digest hash of retrieved + * content-bodies. + */ + { + setDigestContent(true); + } + + public boolean getDigestContent() { + return (Boolean) kp.get("digestContent"); + } + + public void setDigestContent(boolean digest) { + kp.put("digestContent", digest); + } + + /** + * Which algorithm (for example MD5 or SHA-1) to use to perform an + * on-the-fly digest hash of retrieved content-bodies. + */ + protected String digestAlgorithm = "sha1"; + + public String getDigestAlgorithm() { + return digestAlgorithm; + } + + public void setDigestAlgorithm(String digestAlgorithm) { + this.digestAlgorithm = digestAlgorithm; + } + + /** + * Maximum length in bytes to fetch. Fetch is truncated at this length. A + * value of 0 means no limit. + */ + { + setMaxLengthBytes(0L); // no limit + } + + public long getMaxLengthBytes() { + return (Long) kp.get("maxLengthBytes"); + } + + public void setMaxLengthBytes(long timeout) { + kp.put("maxLengthBytes", timeout); + } + + /** + * The maximum KB/sec to use when fetching data from a server. The default + * of 0 means no maximum. + */ + { + setMaxFetchKBSec(0); // no limit + } + + public int getMaxFetchKBSec() { + return (Integer) kp.get("maxFetchKBSec"); + } + + public void setMaxFetchKBSec(int rate) { + kp.put("maxFetchKBSec", rate); + } + + /** + * If the fetch is not completed in this number of seconds, give up (and + * retry later). + */ + { + setTimeoutSeconds(20 * 60); // 20 minutes + } + + public int getTimeoutSeconds() { + return (Integer) kp.get("timeoutSeconds"); + } + + public void setTimeoutSeconds(int timeout) { + kp.put("timeoutSeconds", timeout); + } + + /** + * If the socket is unresponsive for this number of milliseconds, give up. + * Set to zero for no timeout (Not. recommended. Could hang a thread on an + * unresponsive server). This timeout is used timing out socket opens and + * for timing out each socket read. 
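The properties defined here (credentials, directory extraction, digesting, size and rate limits, and the two timeouts) are ordinary bean properties. A purely illustrative wiring sketch in plain Java; in a real crawl they would be set on the FetchSFTP bean in crawler-beans.cxml:

import org.archive.modules.fetcher.FetchSFTP;

public class SftpFetcherWiring {
    static FetchSFTP configuredFetcher() {      // hypothetical helper, not part of the patch
        FetchSFTP fetchSftp = new FetchSFTP();
        fetchSftp.setUsername("anonymous");     // used when the URI carries no userinfo
        fetchSftp.setPassword("password");
        fetchSftp.setExtractFromDirs(true);     // turn directory listings into outlinks
        fetchSftp.setMaxLengthBytes(0L);        // 0 = no size cap
        fetchSftp.setTimeoutSeconds(20 * 60);   // overall fetch deadline
        fetchSftp.setSoTimeoutMs(20 * 1000);    // per-read socket timeout
        return fetchSftp;
    }
}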
Make sure this value is < + * {@link #TIMEOUT_SECONDS} for optimal configuration: ensures at least one + * retry read. + */ + { + setSoTimeoutMs(20 * 1000); // 20 seconds + } + + public int getSoTimeoutMs() { + return (Integer) kp.get("soTimeoutMs"); + } + + public void setSoTimeoutMs(int timeout) { + kp.put("soTimeoutMs", timeout); + } + + /** + * Constructs a new FetchSFTP. + */ + public FetchSFTP() { + // + } + + @Override + protected boolean shouldProcess(CrawlURI curi) { + if (!curi.getUURI().getScheme().equals("sftp")) { + return false; + } + + return true; + } + + /** + * Processes the given URI. If the given URI is not an FTP URI, then + * this method does nothing. Otherwise an attempt is made to connect + * to the FTP server. + * + *

+ * If the connection is successful, an attempt will be made to CD to + * the path specified in the URI. If the remote CD command succeeds, + * then it is assumed that the URI represents a directory. If the + * CD command fails, then it is assumed that the URI represents + * a file. + * + *

+ * For directories, the directory listing will be fetched using + * the SFTP ls operation, and saved to the HttpRecorder. If the + * extract.from.dirs attribute is set to true, then + * the files in the fetched list will be added to the curi as + * extracted SFTP links. (It was easier to do that here, rather + * than writing a separate SFTP extractor.) + * + *

+ * For files, the file will be fetched using the SFTP get + * operation, and saved to the HttpRecorder. + * + *

+ * All file transfers (including directory listings) occur using + * Binary mode transfer. Also, the local passive transfer mode + * is always used, to play well with firewalls. + * + * @param curi the curi to process + * @throws InterruptedException if the thread is interrupted during + * processing + */ + @Override + protected void innerProcess(CrawlURI curi) throws InterruptedException { + curi.setFetchBeginTime(System.currentTimeMillis()); + ClientSFTP client = new ClientSFTP(); + Recorder recorder = curi.getRecorder(); + + try { + if (logger.isLoggable(Level.FINE)) { + logger.fine("attempting to fetch sftp uri: " + curi); + } + fetch(curi, client, recorder); + } catch (IOException e) { + if (logger.isLoggable(Level.INFO)) { + logger.info(curi + ": " + e); + } + curi.getNonFatalFailures().add(e); + curi.setFetchStatus(FetchStatusCodes.S_CONNECT_FAILED); + } finally { + disconnect(client); + curi.setFetchCompletedTime(System.currentTimeMillis()); + curi.getData().put(A_FTP_CONTROL_CONVERSATION, client.getControlConversation()); + } + } + + /** + * Fetches a document from an FTP server. + * + * @param curi the URI of the document to fetch + * @param client the FTPClient to use for the fetch + * @param recorder the recorder to preserve the document in + * @throws IOException if a network or protocol error occurs + * @throws InterruptedException if the thread is interrupted + */ + private void fetch(CrawlURI curi, ClientSFTP client, Recorder recorder) + throws IOException, InterruptedException { + // Connect to the FTP server. + UURI uuri = curi.getUURI(); + int port = uuri.getPort(); + if (port == -1) { + port = 22; + } + + String[] arrayOfString = getAuth(curi); + client.connect(arrayOfString[0], uuri.getHost(), port, arrayOfString[1]); + + ChannelSftp channelSftp = null; + + try { + channelSftp = client.openSFTPChannel(); + } catch (JSchException jSchException) { + jSchException.printStackTrace(); + curi.getData().put(A_RUNTIME_EXCEPTION, jSchException); + } + + boolean bool = false; + try { + client.cd(uuri.getPath()); + bool = client.isDirectory(uuri.getPath()); + } catch (SftpException sftpException) { + + try { + client.cd("/"); + } catch (Exception exception) { + logger.severe("cannot cd /"); + curi.getData().put(A_RUNTIME_EXCEPTION, sftpException); + } + } catch (Exception exception) { + exception.printStackTrace(); + curi.getData().put(A_RUNTIME_EXCEPTION, exception); + } + + if (bool) { + curi.getAnnotations().add("sftpDirectoryList"); + } + + if (channelSftp != null) { + + boolean digestContent = getDigestContent(); + String algorithm = null; + if (digestContent) { + algorithm = getDigestAlgorithm(); + recorder.getRecordedInput().setDigest(algorithm); + recorder.getRecordedInput().startDigest(); + } else { + recorder.getRecordedInput().setDigest((MessageDigest) null); + } + + try { + if (bool) { + saveDirectoryToRecorder(curi, channelSftp, recorder); + + curi.setFetchStatus(226); + curi.getData().put(A_FTP_FETCH_STATUS, "226 Directory send OK."); + } else { + saveFileToRecorder(curi, channelSftp, recorder); + + curi.setFetchStatus(226); + curi.getData().put(A_FTP_FETCH_STATUS, "226 File send OK."); + } + } catch (SftpException sftpException) { + logger.severe("error while getting " + curi + " :" + sftpException); + curi.setFetchStatus(-3); + curi.getData().put(A_FTP_FETCH_STATUS, "SFTP error."); + } finally { + + recorder.close(); + client.disconnect(); + curi.setContentSize(recorder.getRecordedInput().getSize()); + + if (bool) { + curi.setContentType("text/plain"); + } else { 
+ curi.setContentType("application/octet-stream"); + } + + if (logger.isLoggable(Level.INFO)) { + logger.fine("read " + recorder.getRecordedInput().getSize() + " bytes from sftp data socket"); + } + + if (digestContent) { + curi.setContentDigest(algorithm, recorder.getRecordedInput().getDigestValue()); + } + } + curi.getData().put(A_RUNTIME_EXCEPTION, client.getControlConversation()); + if (bool) { + extract(curi, recorder); + } + } + + addParent(curi); + } + + @SuppressWarnings("unchecked") + private void saveDirectoryToRecorder(CrawlURI paramCrawlURI, ChannelSftp paramChannelSftp, Recorder paramHttpRecorder) + throws IOException, InterruptedException { + try { + Vector vector = paramChannelSftp.ls(paramCrawlURI.getUURI().getPath()); + StringBuilder stringBuilder = new StringBuilder(); + for (LsEntry lsEntry : vector) { + stringBuilder.append(lsEntry.getFilename()); + stringBuilder.append('\n'); + } + ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(stringBuilder.toString().getBytes()); + + paramCrawlURI.setRecorder(paramHttpRecorder); + paramHttpRecorder.inputWrap(byteArrayInputStream); + paramHttpRecorder.outputWrap(paramChannelSftp.getOutputStream()); + paramHttpRecorder.markContentBegin(); + + long softMax = 0; + long hardMax = getMaxLengthBytes(); + long timeout = (long) getTimeoutSeconds() * 1000L; + int maxRate = getMaxFetchKBSec(); + + RecordingInputStream recordingInputStream = paramHttpRecorder.getRecordedInput(); + recordingInputStream.setLimits(hardMax, timeout, maxRate); + recordingInputStream.readFullyOrUntil(softMax); + } catch (SftpException sftpException) { + logger.severe("ls : " + paramCrawlURI.getUURI().getPath() + " not a path"); + } + } + + private void saveFileToRecorder(CrawlURI paramCrawlURI, ChannelSftp paramChannelSftp, Recorder paramHttpRecorder) + throws IOException, InterruptedException, SftpException { + String str = new String(paramCrawlURI.getUURI().getPath()); + + InputStream inputStream = paramChannelSftp.get(str); + + paramCrawlURI.setRecorder(paramHttpRecorder); + paramHttpRecorder.inputWrap(inputStream); + paramHttpRecorder.outputWrap(paramChannelSftp.getOutputStream()); + paramHttpRecorder.markContentBegin(); + + long softMax = 0; + long hardMax = getMaxLengthBytes(); + long timeout = (long) getTimeoutSeconds() * 1000L; + int maxRate = getMaxFetchKBSec(); + + RecordingInputStream recordingInputStream = paramHttpRecorder.getRecordedInput(); + recordingInputStream.setLimits(hardMax, timeout, maxRate); + recordingInputStream.readFullyOrUntil(softMax); + } + + private void extract(CrawlURI paramCrawlURI, Recorder paramHttpRecorder) { + if (!getExtractFromDirs()) { + return; + } + + ReplayCharSequence replayCharSequence = null; + try { + replayCharSequence = paramHttpRecorder.getContentReplayCharSequence(); + extract(paramCrawlURI, replayCharSequence); + } catch (IOException iOException) { + logger.log(Level.SEVERE, "IO error during extraction.", iOException); + } catch (RuntimeException runtimeException) { + logger.log(Level.SEVERE, "IO error during extraction.", runtimeException); + } finally { + close(replayCharSequence); + } + } + + private void extract(CrawlURI paramCrawlURI, ReplayCharSequence paramReplayCharSequence) { + Matcher matcher = DIR.matcher((CharSequence) paramReplayCharSequence); + + while (matcher.find()) { + String str = matcher.group(1); + + addExtracted(paramCrawlURI, str); + } + } + + /** + * Adds an extracted filename to the curi. 
A new URI will be formed + * by taking the given curi (which should represent the directory the + * file lives in) and appending the file. + * + * @param curi the curi to store the discovered link in + * @param file the filename of the discovered link + */ + private void addExtracted(CrawlURI curi, String file) { + try { + file = URLEncoder.encode(file, "UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new AssertionError(e); + } + if (logger.isLoggable(Level.FINEST)) { + logger.log(Level.FINEST, "Found " + file); + } + String base = curi.toString(); + if (base.endsWith("/")) { + base = base.substring(0, base.length() - 1); + } + try { + UURI n = UURIFactory.getInstance(base + "/" + file); + CrawlURI link = curi.createCrawlURI(n, LinkContext.NAVLINK_MISC, Hop.NAVLINK); + curi.getOutLinks().add(link); + } catch (URIException e) { + logger.log(Level.WARNING, "URI error during extraction.", e); + } + } + + private void addParent(CrawlURI curi) { + if (!getExtractParent()) { + return; + } + UURI uuri = curi.getUURI(); + try { + if (uuri.getPath().equals("/")) { + return; + } + + String scheme = uuri.getScheme(); + String auth = uuri.getEscapedAuthority(); + String path = uuri.getEscapedCurrentHierPath(); + UURI parent = UURIFactory.getInstance(scheme + "://" + auth + path); + + CrawlURI link = curi.createCrawlURI(parent, LinkContext.NAVLINK_MISC, + Hop.NAVLINK); + curi.getOutLinks().add(link); + } catch (URIException uRIException) { + logger.log(Level.WARNING, "URI error during extraction.", (Throwable) uRIException); + } + } + + /** + * Returns the username and password for the given URI. This method + * always returns an array of length 2. The first element in the returned + * array is the username for the URI, and the second element is the + * password. + * + *

+ * If the URI itself contains the username and password (i.e., it looks + * like sftp://username:password@host/path) then that username + * and password are returned. + * + *

+ * Otherwise the settings system is probed for the username + * and password attributes for this SFTPFetch + * and the given curi context. The values of those + * attributes are then returned. + * + * @param curi the curi whose username and password to return + * @return an array containing the username and password + */ + private String[] getAuth(CrawlURI curi) { + String[] result = new String[2]; + UURI uuri = curi.getUURI(); + String userinfo; + try { + userinfo = uuri.getUserinfo(); + } catch (URIException e) { + assert false; + logger.finest("getUserinfo raised URIException."); + userinfo = null; + } + if (userinfo != null) { + int p = userinfo.indexOf(':'); + if (p > 0) { + result[0] = userinfo.substring(0, p); + result[1] = userinfo.substring(p + 1); + return result; + } + } + result[0] = getUsername(); + result[1] = getPassword(); + return result; + } + + private static void close(ReplayCharSequence paramReplayCharSequence) { + if (paramReplayCharSequence == null) { + return; + } + try { + paramReplayCharSequence.close(); + } catch (IOException iOException) { + logger.log(Level.WARNING, "IO error closing ReplayCharSequence.", iOException); + } + } + + private static void disconnect(ClientSFTP paramClientSFTP) { + if (paramClientSFTP.isConnected()) + paramClientSFTP.disconnect(); + } +} diff --git a/modules/src/main/java/org/archive/modules/recrawl/AbstractPersistProcessor.java b/modules/src/main/java/org/archive/modules/recrawl/AbstractPersistProcessor.java index e919c26d7..a014e662c 100644 --- a/modules/src/main/java/org/archive/modules/recrawl/AbstractPersistProcessor.java +++ b/modules/src/main/java/org/archive/modules/recrawl/AbstractPersistProcessor.java @@ -50,7 +50,7 @@ protected boolean shouldStore(CrawlURI curi) { // DNS query need not be persisted String scheme = curi.getUURI().getScheme(); - if (!(scheme.equals("http") || scheme.equals("https") || scheme.equals("ftp"))) { + if (!(scheme.equals("http") || scheme.equals("https") || scheme.equals("ftp") || scheme.equals("sftp"))) { return false; } diff --git a/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java index 6322c5a78..f7cc42e69 100644 --- a/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/FtpControlConversationRecordBuilder.java @@ -19,7 +19,7 @@ public class FtpControlConversationRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldBuildRecord(CrawlURI curi) { - return "ftp".equals(curi.getUURI().getScheme().toLowerCase()); + return "ftp".equalsIgnoreCase(curi.getUURI().getScheme()) || "sftp".equalsIgnoreCase(curi.getUURI().getScheme()); } @Override diff --git a/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java index 80b63e72b..ecf1b80be 100644 --- a/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/FtpResponseRecordBuilder.java @@ -18,7 +18,7 @@ public class FtpResponseRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldBuildRecord(CrawlURI curi) { return !curi.isRevisit() - && "ftp".equals(curi.getUURI().getScheme().toLowerCase()); + && ("ftp".equalsIgnoreCase(curi.getUURI().getScheme()) || "sftp".equalsIgnoreCase(curi.getUURI().getScheme())); } @Override diff --git 
a/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java index c167740f4..15fd1c04c 100644 --- a/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/MetadataRecordBuilder.java @@ -24,7 +24,7 @@ public class MetadataRecordBuilder extends BaseWARCRecordBuilder { @Override public boolean shouldBuildRecord(CrawlURI curi) { String scheme = curi.getUURI().getScheme().toLowerCase(); - return scheme.startsWith("http") || "ftp".equals(scheme); + return scheme.startsWith("http") || "ftp".equals(scheme) || "sftp".equals(scheme); } @Override diff --git a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java index f2382562c..87f0e5eb3 100644 --- a/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java +++ b/modules/src/main/java/org/archive/modules/warc/RevisitRecordBuilder.java @@ -25,7 +25,7 @@ public class RevisitRecordBuilder extends BaseWARCRecordBuilder { public boolean shouldBuildRecord(CrawlURI curi) { String scheme = curi.getUURI().getScheme().toLowerCase(); return curi.isRevisit() - && (scheme.startsWith("http") || scheme.equals("ftp")); + && (scheme.startsWith("http") || scheme.equals("ftp") || scheme.equals("sftp")); } @Override diff --git a/modules/src/main/java/org/archive/modules/writer/WARCWriterProcessor.java b/modules/src/main/java/org/archive/modules/writer/WARCWriterProcessor.java index 4726a0079..d25f5afa3 100644 --- a/modules/src/main/java/org/archive/modules/writer/WARCWriterProcessor.java +++ b/modules/src/main/java/org/archive/modules/writer/WARCWriterProcessor.java @@ -184,7 +184,7 @@ protected ProcessResult write(final String lowerCaseScheme, writeHttpRecords(curi, writer, baseid, timestamp); } else if (lowerCaseScheme.equals("dns")) { writeDnsRecords(curi, writer, baseid, timestamp); - } else if (lowerCaseScheme.equals("ftp")) { + } else if (lowerCaseScheme.equals("ftp") || lowerCaseScheme.equals("sftp")) { writeFtpRecords(writer, curi, baseid, timestamp); } else if (lowerCaseScheme.equals("whois")) { writeWhoisRecords(writer, curi, baseid, timestamp); diff --git a/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java b/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java index a5a030e81..72468bf8c 100644 --- a/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java +++ b/modules/src/main/java/org/archive/modules/writer/WriterPoolProcessor.java @@ -346,7 +346,7 @@ protected boolean shouldWrite(CrawlURI curi) { retVal = curi.getFetchStatus() == S_WHOIS_SUCCESS; } else if (scheme.equals("http") || scheme.equals("https")) { retVal = curi.getFetchStatus() > 0 && curi.isHttpTransaction(); - } else if (scheme.equals("ftp")) { + } else if (scheme.equals("ftp") || scheme.equals("sftp")) { retVal = curi.getFetchStatus() > 0; } else { logger.info("This writer does not write out scheme " + From 0a2f57fa73e65ee36c82c3556b849669b590c793 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 3 Apr 2020 16:18:07 -0700 Subject: [PATCH 085/123] best medium-ish size --- .../java/org/archive/modules/extractor/ExtractorYoutubeDL.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java index 
661e06691..066959f78 100644 --- a/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java +++ b/contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java @@ -419,7 +419,7 @@ protected YoutubeDLResults runYoutubeDL(CrawlURI uri) { * https://github.com/ytdl-org/youtube-dl/blob/master/README.md#format-selection */ ProcessBuilder pb = new ProcessBuilder("youtube-dl", "--ignore-config", - "--simulate", "--dump-single-json", "--format=best", + "--simulate", "--dump-single-json", "--format=best[height <=? 576]", "--playlist-end=" + MAX_VIDEOS_PER_PAGE, uri.toString()); logger.info("running: " + String.join(" ", pb.command())); From 6710c8dba38cf8fce6281e36fc6ae0dd1bece5ed Mon Sep 17 00:00:00 2001 From: Clara Wiatrowski Date: Fri, 10 Apr 2020 15:59:50 +0200 Subject: [PATCH 086/123] Add parsing for HTML tags (data-src, data-srcset, data-original and data-original-set) --- .../modules/extractor/ExtractorHTML.java | 1975 +++++++++-------- .../modules/extractor/HTMLLinkContext.java | 19 + .../modules/extractor/ExtractorHTMLTest.java | 137 +- 3 files changed, 1143 insertions(+), 988 deletions(-) diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java index ebcaf47a0..22f75d944 100644 --- a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java +++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java @@ -28,8 +28,10 @@ import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.httpclient.URIException; +import org.apache.commons.lang.StringUtils; import org.archive.io.ReplayCharSequence; import org.archive.modules.CoreAttributeConstants; import org.archive.modules.CrawlMetadata; @@ -43,1000 +45,999 @@ import org.springframework.beans.factory.InitializingBean; import org.springframework.beans.factory.annotation.Autowired; +import com.google.common.base.Strings; + +import au.id.jericho.lib.html.Element; + /** - * Basic link-extraction, from an HTML content-body, - * using regular expressions. + * Basic link-extraction, from an HTML content-body, using regular expressions. * - * NOTE: This processor may open a ReplayCharSequence from the - * CrawlURI's Recorder, without closing that ReplayCharSequence, to allow - * reuse by later processors in sequence. In the usual (Heritrix) case, a - * call after all processing to the Recorder's endReplays() method ensures - * timely close of any reused ReplayCharSequences. Reuse of this processor - * elsewhere should ensure a similar cleanup call to Recorder.endReplays() - * occurs. + * NOTE: This processor may open a ReplayCharSequence from the CrawlURI's + * Recorder, without closing that ReplayCharSequence, to allow reuse by later + * processors in sequence. In the usual (Heritrix) case, a call after all + * processing to the Recorder's endReplays() method ensures timely close of any + * reused ReplayCharSequences. Reuse of this processor elsewhere should ensure a + * similar cleanup call to Recorder.endReplays() occurs. * - * TODO: Compare against extractors based on HTML parsing libraries for + * TODO: Compare against extractors based on HTML parsing libraries for * accuracy, completeness, and speed. 
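The last patch above teaches ExtractorHTML to read lazy-loading attributes (data-src, data-srcset, data-original, data-original-set). The motivation is easy to show in isolation: pages often point src at a placeholder and keep the real URL in a data-* attribute, so an extractor that reads only src misses the content. A simplified standalone sketch, deliberately not Heritrix's production pattern:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LazyImageSketch {
    public static void main(String[] args) {
        String html = "<img src=\"placeholder.gif\" data-src=\"http://example.com/photo.jpg\""
                + " data-srcset=\"http://example.com/photo-2x.jpg 2x\">";
        // Toy pattern for the two most common lazy-load attributes; a real extractor must also
        // split srcset values into their component URLs and resolve them against the page base.
        Matcher m = Pattern.compile("data-src(?:set)?\\s*=\\s*\"([^\"]+)\"").matcher(html);
        while (m.find()) {
            System.out.println("candidate: " + m.group(1));
        }
    }
}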
* * @author gojomo */ public class ExtractorHTML extends ContentExtractor implements InitializingBean { - @SuppressWarnings("unused") - private static final long serialVersionUID = 2L; - - private static Logger logger = - Logger.getLogger(ExtractorHTML.class.getName()); - - private final static String MAX_ELEMENT_REPLACE = "MAX_ELEMENT"; - - private final static String MAX_ATTR_NAME_REPLACE = "MAX_ATTR_NAME"; - - private final static String MAX_ATTR_VAL_REPLACE = "MAX_ATTR_VAL"; - - public final static String A_META_ROBOTS = "meta-robots"; - - public final static String A_FORM_OFFSETS = "form-offsets"; - - { - setMaxElementLength(64); - } - public int getMaxElementLength() { - return (Integer) kp.get("maxElementLength"); - } - public void setMaxElementLength(int max) { - kp.put("maxElementLength",max); - } - - - /** - * Relevant tag extractor. - * - *

- * This pattern extracts either: - *

- *
    - *
  • (1) whole <script>...</script> or - *
  • (2) <style>...</style> or - *
  • (3) <meta ...> or - *
  • (4) any other open-tag with at least one attribute (eg matches - * "<a href='boo'>" but not "</a>" or "<br>") - *
- *

- * groups: - *

- *
    - *
  • 1: SCRIPT SRC=foo>boo</SCRIPT - *
  • 2: just script open tag - *
  • 3: STYLE TYPE=moo>zoo</STYLE - *
  • 4: just style open tag - *
  • 5: entire other tag, without '<' '>' - *
  • 6: element - *
  • 7: META - *
  • 8: !-- comment -- - *
- * - *

- * HER-1998 - Modified part 8 to allow conditional html comments. - * Conditional HTML comment example: - * "<!--[if expression]> HTML <![endif]-->" - *

- * - *

- * This technique is commonly used to reference CSS & JavaScript that - * are designed to deal with the quirks of a specific version of Internet - * Explorer. There is another syntax for conditional comments which already - * gets parsed by the regex since it doesn't start with "<!--" Ex. - * <!if expression> HTML <!endif> - *

- * - *

- * https://en.wikipedia.org/wiki/Conditional_Comments - *

- */ - // version w/ less unnecessary backtracking - static final String RELEVANT_TAG_EXTRACTOR = - "(?is)<(?:((script[^>]*+)>.*?]*+)>.*?]*+)" + // 5, 6, 7 - "|(!--(?!\\[if|>).*?--))>"; // 8 - -// version w/ problems with unclosed script tags -// static final String RELEVANT_TAG_EXTRACTOR = -// "(?is)<(?:((script.*?)>.*?.*?"; - - - -// // this pattern extracts 'href' or 'src' attributes from -// // any open-tag innards matched by the above -// static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile( -// "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))"); -// -// // this pattern extracts 'robots' attributes -// static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile( -// "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))"); - - { - setMaxAttributeNameLength(64); // 64 chars - } - - public int getMaxAttributeNameLength() { - return (Integer) kp.get("maxAttributeNameLength"); - } - - public void setMaxAttributeNameLength(int max) { - kp.put("maxAttributeNameLength", max); - } - - - { - setMaxAttributeValLength(2048); // 2K - } - - public int getMaxAttributeValLength() { - return (Integer) kp.get("maxAttributeValLength"); - } - - public void setMaxAttributeValLength(int max) { - kp.put("maxAttributeValLength", max); - } - - // TODO: perhaps cut to near MAX_URI_LENGTH - - // this pattern extracts attributes from any open-tag innards - // matched by the above. attributes known to be URIs of various - // sorts are matched specially - static final String EACH_ATTRIBUTE_EXTRACTOR = - "(?is)\\s?((href)|(action)|(on\\w*)" // 1, 2, 3, 4 - +"|((?:src)|(?:srcset)|(?:lowsrc)|(?:background)|(?:cite)" // ... - +"|(?:longdesc)|(?:usemap)|(?:profile)|(?:datasrc))" // 5 - +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9 - +"|(value)|(style)|(method)" // 10, 11, 12 - +"|([-\\w]{1,"+MAX_ATTR_NAME_REPLACE+"}))" // 13 - +"\\s*=\\s*" - +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_REPLACE+"}?)(?:\"|$))" // 14 - +"|(?:'(.{0,"+MAX_ATTR_VAL_REPLACE+"}?)(?:'|$))" // 15 - +"|(\\S{1,"+MAX_ATTR_VAL_REPLACE+"}))"; // 16 - // groups: - // 1: attribute name - // 2: HREF - single URI relative to doc base, or occasionally javascript: - // 3: ACTION - single URI relative to doc base, or occasionally javascript: - // 4: ON[WHATEVER] - script handler - // 5: SRC,SRCSET,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC - // single URI relative to doc base - // 6: CODEBASE - a single URI relative to doc base, affecting other - // attributes - // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied) - // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE - // (if supplied) - // 9: CODE - a single URI relative to the CODEBASE (is specified). - // 10: VALUE - often includes a uri path on forms - // 11: STYLE - inline attribute style info - // 12: METHOD - form GET/POST - // 13: any other attribute - // 14: double-quote delimited attr value - // 15: single-quote delimited attr value - // 16: space-delimited attr value - - - static final String WHITESPACE = "\\s"; - static final String CLASSEXT =".class"; - static final String APPLET = "applet"; - static final String BASE = "base"; - static final String LINK = "link"; - static final String FRAME = "frame"; - static final String IFRAME = "iframe"; - - - /** - * If true, FRAME/IFRAME SRC-links are treated as embedded resources (like - * IMG, 'E' hop-type), otherwise they are treated as navigational links. - * Default is true. 
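The comments above describe a two-stage approach: first locate relevant open tags, then scan each tag's innards for URI-bearing attributes. A heavily simplified sketch of that shape, using toy patterns rather than the production RELEVANT_TAG_EXTRACTOR and EACH_ATTRIBUTE_EXTRACTOR:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TwoStageExtractSketch {
    private static final Pattern TAG = Pattern.compile("(?is)<(\\w+[^>]*)>");
    private static final Pattern ATTR = Pattern.compile("(?is)\\b(href|src|action)\\s*=\\s*\"([^\"]*)\"");

    public static void main(String[] args) {
        String html = "<a href=\"http://example.com/page\">x</a><img src=\"http://example.com/img.png\">";
        Matcher tags = TAG.matcher(html);
        while (tags.find()) {
            Matcher attrs = ATTR.matcher(tags.group(1)); // scan only the open tag's innards
            while (attrs.find()) {
                System.out.println(attrs.group(1) + " -> " + attrs.group(2));
            }
        }
    }
}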
-    static final String WHITESPACE = "\\s";
-    static final String CLASSEXT = ".class";
-    static final String APPLET = "applet";
-    static final String BASE = "base";
-    static final String LINK = "link";
-    static final String FRAME = "frame";
-    static final String IFRAME = "iframe";
-
-    /**
-     * If true, FRAME/IFRAME SRC-links are treated as embedded resources (like
-     * IMG, 'E' hop-type), otherwise they are treated as navigational links.
-     * Default is true.
-     */
-    {
-        setTreatFramesAsEmbedLinks(true);
-    }
-    public boolean getTreatFramesAsEmbedLinks() {
-        return (Boolean) kp.get("treatFramesAsEmbedLinks");
-    }
-    public void setTreatFramesAsEmbedLinks(boolean asEmbeds) {
-        kp.put("treatFramesAsEmbedLinks", asEmbeds);
-    }
-
-    /**
-     * If true, URIs appearing as the ACTION attribute in HTML FORMs are
-     * ignored. Default is false.
-     */
-    {
-        setIgnoreFormActionUrls(false);
-    }
-    public boolean getIgnoreFormActionUrls() {
-        return (Boolean) kp.get("ignoreFormActionUrls");
-    }
-    public void setIgnoreFormActionUrls(boolean ignoreActions) {
-        kp.put("ignoreFormActionUrls", ignoreActions);
-    }
-
-    /**
-     * If true, only ACTION URIs with a METHOD of GET (explicit or implied)
-     * are extracted. Default is true.
-     */
-    {
-        setExtractOnlyFormGets(true);
-    }
-    public boolean getExtractOnlyFormGets() {
-        return (Boolean) kp.get("extractOnlyFormGets");
-    }
-    public void setExtractOnlyFormGets(boolean onlyGets) {
-        kp.put("extractOnlyFormGets", onlyGets);
-    }
-
-    /**
-     * If true, in-page Javascript is scanned for strings that appear likely
-     * to be URIs. This typically finds both valid and invalid URIs, and
-     * attempts to fetch the invalid URIs sometimes generate webmaster
-     * concerns over odd crawler behavior. Default is true.
-     */
-    {
-        setExtractJavascript(true);
-    }
-    public boolean getExtractJavascript() {
-        return (Boolean) kp.get("extractJavascript");
-    }
-    public void setExtractJavascript(boolean extractJavascript) {
-        kp.put("extractJavascript", extractJavascript);
-    }
-
-    /**
-     * If true, strings that look like URIs found in unusual places (such as
-     * form VALUE attributes) will be extracted. This typically finds both
-     * valid and invalid URIs, and attempts to fetch the invalid URIs
-     * sometimes generate webmaster concerns over odd crawler behavior.
-     * Default is true.
-     */
-    {
-        setExtractValueAttributes(true);
-    }
-    public boolean getExtractValueAttributes() {
-        return (Boolean) kp.get("extractValueAttributes");
-    }
-    public void setExtractValueAttributes(boolean extractValueAttributes) {
-        kp.put("extractValueAttributes", extractValueAttributes);
-    }
-
-    /**
-     * If true, URIs which end in typical non-HTML extensions (such as .gif)
-     * will not be scanned as if they were HTML. Default is true.
-     */
-    {
-        setIgnoreUnexpectedHtml(true);
-    }
-    public boolean getIgnoreUnexpectedHtml() {
-        return (Boolean) kp.get("ignoreUnexpectedHtml");
-    }
-    public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) {
-        kp.put("ignoreUnexpectedHtml", ignoreUnexpectedHtml);
-    }
-
-    /**
-     * CrawlMetadata provides the robots honoring policy to use when
-     * considering a robots META tag.
-     */
-    protected CrawlMetadata metadata;
-    public CrawlMetadata getMetadata() {
-        return metadata;
-    }
-    @Autowired
-    public void setMetadata(CrawlMetadata provider) {
-        this.metadata = provider;
-    }
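// A sketch for illustration (the "maxAttributeCount" name is hypothetical, not part
// of this class): new settings on this bean follow the same three-part pattern seen
// above, an instance initializer seeding the default into the keyed-properties map
// 'kp' plus a getter/setter pair over that map, which is what allows the value to be
// overridden at runtime (for example per settings sheet) rather than being a plain field.
//
//   {
//       setMaxAttributeCount(1000); // hypothetical default
//   }
//   public int getMaxAttributeCount() {
//       return (Integer) kp.get("maxAttributeCount");
//   }
//   public void setMaxAttributeCount(int max) {
//       kp.put("maxAttributeCount", max);
//   }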
-    /**
-     * Javascript extractor to use to process inline javascript. Autowired if
-     * available. If null, links will not be extracted from inline javascript.
-     */
-    transient protected ExtractorJS extractorJS;
-    public ExtractorJS getExtractorJS() {
-        return extractorJS;
-    }
-    @Autowired
-    public void setExtractorJS(ExtractorJS extractorJS) {
-        this.extractorJS = extractorJS;
-    }
-
-    // TODO: convert to Strings
-    private String relevantTagPattern;
-    private String eachAttributePattern;
-
-    public ExtractorHTML() {
-    }
-
-    public void afterPropertiesSet() {
-        String regex = RELEVANT_TAG_EXTRACTOR;
-        regex = regex.replace(MAX_ELEMENT_REPLACE,
-                Integer.toString(getMaxElementLength()));
-        this.relevantTagPattern = regex;
-
-        regex = EACH_ATTRIBUTE_EXTRACTOR;
-        regex = regex.replace(MAX_ATTR_NAME_REPLACE,
-                Integer.toString(getMaxAttributeNameLength()));
-        regex = regex.replace(MAX_ATTR_VAL_REPLACE,
-                Integer.toString(getMaxAttributeValLength()));
-        this.eachAttributePattern = regex;
-    }
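// Worked example (illustrative, not from the patch; it assumes the MAX_*_REPLACE
// constants defined earlier in this class are placeholder tokens embedded in the
// pattern strings): with the defaults set above (maxAttributeNameLength = 64,
// maxAttributeValLength = 2048), afterPropertiesSet() replaces each placeholder with
// the configured bound, so a template fragment such as
//
//   "|([-\\w]{1," + MAX_ATTR_NAME_REPLACE + "}))"   ends up as the pattern text   |([-\w]{1,64}))
//
// and the {0,MAX_ATTR_VAL_REPLACE} bounds become {0,2048}, capping how much of an
// over-long attribute name or value the compiled regex will consume.
// RELEVANT_TAG_EXTRACTOR gets the same treatment using getMaxElementLength().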
-    protected void processGeneralTag(CrawlURI curi, CharSequence element,
-            CharSequence cs) {
-
-        Matcher attr = TextUtils.getMatcher(eachAttributePattern, cs);
-
-        // Just in case it's an OBJECT or APPLET tag
-        String codebase = null;
-        ArrayList resources = null;
-
-        // Just in case it's a FORM
-        CharSequence action = null;
-        CharSequence actionContext = null;
-        CharSequence method = null;
-
-        // Just in case it's a VALUE whose interpretation depends on accompanying NAME
-        CharSequence valueVal = null;
-        CharSequence valueContext = null;
-        CharSequence nameVal = null;
-
-        final boolean framesAsEmbeds = getTreatFramesAsEmbedLinks();
-        final boolean ignoreFormActions = getIgnoreFormActionUrls();
-        final boolean extractValueAttributes = getExtractValueAttributes();
-        final String elementStr = element.toString();
-
-        while (attr.find()) {
-            int valueGroup =
-                (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
-            int start = attr.start(valueGroup);
-            int end = attr.end(valueGroup);
-            assert start >= 0: "Start is: " + start + ", " + curi;
-            assert end >= 0: "End is :" + end + ", " + curi;
-            CharSequence value = cs.subSequence(start, end);
-            CharSequence attrName = cs.subSequence(attr.start(1), attr.end(1));
-            value = TextUtils.unescapeHtml(value);
-            if (attr.start(2) > -1) {
-                CharSequence context;
-                // HREF
-                if ("a".equals(element) && TextUtils.matches(
-                        "(?i).*data-remote\\s*=\\s*([\"'])true.*\\1", cs)) {
-                    context = "a[data-remote='true']/@href";
-                } else {
-                    context = elementContext(element, attr.group(2));
-                }
-
-                if ("a[data-remote='true']/@href".equals(context)
-                        || elementStr.equalsIgnoreCase(LINK)) {
-                    // elements treated as embeds (css, ico, etc)
-                    processEmbed(curi, value, context);
-                } else {
-                    // other HREFs treated as links
-                    processLink(curi, value, context);
-                }
-                // Set the relative or absolute base URI if it's not already been modified.
-                // See https://github.com/internetarchive/heritrix3/pull/209
-                if (elementStr.equalsIgnoreCase(BASE)
-                        && !curi.containsDataKey(CoreAttributeConstants.A_HTML_BASE)) {
-                    try {
-                        UURI base = UURIFactory.getInstance(curi.getUURI(), value.toString());
-                        curi.setBaseURI(base);
-                    } catch (URIException e) {
-                        logUriError(e, curi.getUURI(), value);
-                    }
-                }
-            } else if (attr.start(3) > -1) {
-                // ACTION
-                if (!ignoreFormActions) {
-                    action = value;
-                    actionContext = elementContext(element, attr.group(3));
-                    // handling finished only at end (after METHOD also collected)
-                }
-            } else if (attr.start(4) > -1) {
-                // ON____
-                processScriptCode(curi, value); // TODO: context?
-            } else if (attr.start(5) > -1) {
-                // SRC etc.
-                CharSequence context = elementContext(element, attr.group(5));
-                if (!context.toString().toLowerCase().startsWith("data:")) {
-
-                    // true, if we expect another HTML page instead of an image etc.
-                    final Hop hop;
-
-                    if (!framesAsEmbeds
-                            && (elementStr.equalsIgnoreCase(FRAME)
-                                    || elementStr.equalsIgnoreCase(IFRAME))) {
-                        hop = Hop.NAVLINK;
-                    } else {
-                        hop = Hop.EMBED;
-                    }
-                    processEmbed(curi, value, context, hop);
-                }
-            } else if (attr.start(6) > -1) {
-                // CODEBASE
-                codebase = (value instanceof String) ? (String) value : value.toString();
-                CharSequence context = elementContext(element, attr.group(6));
-                processLink(curi, codebase, context);
-            } else if (attr.start(7) > -1) {
-                // CLASSID, DATA
-                if (resources == null) {
-                    resources = new ArrayList();
-                }
-                resources.add(value.toString());
-            } else if (attr.start(8) > -1) {
-                // ARCHIVE
-                if (resources == null) {
-                    resources = new ArrayList();
-                }
-                String[] multi = TextUtils.split(WHITESPACE, value);
-                for (int i = 0; i < multi.length; i++) {
-                    resources.add(multi[i]);
-                }
-            } else if (attr.start(9) > -1) {
-                // CODE
-                if (resources == null) {
-                    resources = new ArrayList();
-                }
-                // If element is applet and code value does not end with
-                // '.class' then append '.class' to the code value.
-                if (elementStr.equalsIgnoreCase(APPLET) &&
-                        !value.toString().toLowerCase().endsWith(CLASSEXT)) {
-                    resources.add(value.toString() + CLASSEXT);
-                } else {
-                    resources.add(value.toString());
-                }
-            } else if (attr.start(10) > -1) {
-                // VALUE, with possibility of URI
-                // store value, context for handling at end
-                valueVal = value;
-                valueContext = elementContext(element, attr.group(10));
-            } else if (attr.start(11) > -1) {
-                // STYLE inline attribute
-                // then, parse for URIs
-                numberOfLinksExtracted.addAndGet(
-                        ExtractorCSS.processStyleCode(this, curi, value));
-            } else if (attr.start(12) > -1) {
-                // METHOD
-                method = value;
-                // form processing finished at end (after ACTION also collected)
-            } else if (attr.start(13) > -1) {
-                if ("NAME".equalsIgnoreCase(attrName.toString())) {
-                    // remember 'name' for end-analysis
-                    nameVal = value;
-                }
-                if ("FLASHVARS".equalsIgnoreCase(attrName.toString())) {
-                    // consider FLASHVARS attribute immediately
-                    valueContext = elementContext(element, attr.group(13));
-                    considerQueryStringValues(curi, value, valueContext, Hop.SPECULATIVE);
-                }
-                // any other attribute
-                // ignore for now
-                // could probe for path- or script-looking strings, but
-                // those should be vanishingly rare in other attributes,
-                // and/or symptomatic of page bugs
-            }
-        }
-        TextUtils.recycleMatcher(attr);
-
-        // handle codebase/resources
-        if (resources != null) {
-            Iterator iter = resources.iterator();
-            UURI codebaseURI = null;
-            String res = null;
-            try {
-                if (codebase != null) {
-                    // TODO: Pass in the charset.
-                    codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
-                }
-                while (iter.hasNext()) {
-                    res = iter.next().toString();
-                    res = (String) TextUtils.unescapeHtml(res);
-                    if (codebaseURI != null) {
-                        res = codebaseURI.resolve(res).toString();
-                    }
-                    processEmbed(curi, res, element); // TODO: include attribute too
-                }
-            } catch (URIException e) {
-                curi.getNonFatalFailures().add(e);
-            } catch (IllegalArgumentException e) {
-                DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
-                        "codebase=" + codebase + " res=" + res + "\n" +
-                        DevUtils.extraInfo(), e);
-            }
-        }
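// Worked example (illustration only, not lines from the patch): for a tag such as
//
//   <applet codebase="/java/" code="Game" archive="lib1.jar lib2.jar">
//
// the attribute loop above collects "Game" (with ".class" appended, since the element
// is an applet and the value lacks that extension) plus the two space-split archive
// entries, and the codebase/resources block just above then resolves each against the
// codebase, emitting embeds for roughly /java/Game.class, /java/lib1.jar and
// /java/lib2.jar relative to the page's URI.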
-        // finish handling form action, now method is available
-        if (action != null) {
-            if (method == null || "GET".equalsIgnoreCase(method.toString())
-                    || !getExtractOnlyFormGets()) {
-                processLink(curi, action, actionContext);
-            }
-        }
-
-        // finish handling VALUE
-        if (valueVal != null) {
-            if ("PARAM".equalsIgnoreCase(elementStr) && nameVal != null
-                    && "flashvars".equalsIgnoreCase(nameVal.toString())) {
-                // special handling for <PARAM NAME='flashvars' VALUE='...'>

-            if (cs.getDecodeExceptionCount() > 0) {
-                curi.getNonFatalFailures().add(cs.getCodingException());
-            }
-            // Set flag to indicate that link extraction is completed.
-            return true;
-        } catch (IOException e) {
-            curi.getNonFatalFailures().add(e);
-            logger.log(Level.WARNING, "Failed get of replay char sequence in " +
-                    Thread.currentThread().getName(), e);
-        }
-        return false;
-    }
-
-    // 1. look for <meta http-equiv="content-type" ... charset=...>
-    // 2. if not found then look for <meta charset="...">
-    // 3. if not found then <?xml ... encoding="..."?>
-    protected Charset getContentDeclaredCharset(CrawlURI curi, String contentPrefix) {
-        String charsetName = null;
-        // <meta http-equiv="content-type" content="text/html; charset=...">
-        Matcher matcher = TextUtils.getMatcher(
-                "(?is)<meta[^>]*http-equiv\\s*=\\s*['\"]content-type['\"][^>]*>", contentPrefix);
-        if (matcher.find()) {
-            String metaContentType = matcher.group();
-            TextUtils.recycleMatcher(matcher);
-            matcher = TextUtils.getMatcher("charset=([^'\";\\s>]+)", metaContentType);
-            if (matcher.find()) {
-                charsetName = matcher.group(1);
-            }
-            TextUtils.recycleMatcher(matcher);
-        }
-
-        if (charsetName == null) {
-            // <meta charset="...">
-            matcher = TextUtils.getMatcher(
-                    "(?si)<meta[^>]*charset=['\"]([^'\";\\s>]+)['\"]", contentPrefix);
-            if (matcher.find()) {
-                charsetName = matcher.group(1);
-                TextUtils.recycleMatcher(matcher);
-            } else {
-                // <?xml version="1.0" encoding="..."?>
-                matcher = TextUtils.getMatcher(
-                        "(?is)<\\?xml\\s+[^>]*encoding=['\"]([^'\"]+)['\"]", contentPrefix);
-                if (matcher.find()) {
-                    charsetName = matcher.group(1);
-                } else {
-                    return null; // none found
-                }
-                TextUtils.recycleMatcher(matcher);
-            }
-        }
-        try {
-            return Charset.forName(charsetName);
-        } catch (IllegalArgumentException iae) {
-            logger.log(Level.INFO, "Unknown content-encoding '" + charsetName + "' declared; using default");
-            curi.getAnnotations().add("unsatisfiableCharsetInHTML:" + charsetName);
-            return null;
-        }
-    }
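// Illustration (a sketch, not part of the patch): the three probes above would
// resolve example document prefixes as follows.
//
//   <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">  ->  ISO-8859-1  (step 1)
//   <meta charset="utf-8">                                                    ->  UTF-8       (step 2)
//   <?xml version="1.0" encoding="Shift_JIS"?>                                ->  Shift_JIS   (step 3)
//
// A declared but unsupported name such as charset="bogus-charset" reaches
// Charset.forName(), which throws a subclass of IllegalArgumentException, so the
// method annotates the URI ("unsatisfiableCharsetInHTML:...") and returns null,
// leaving the caller on its default charset.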
-    /**
-     * Run extractor.
-     * This method is package visible to ease testing.
-     * @param curi CrawlURI we're processing.
-     * @param cs Sequence from underlying ReplayCharSequence. This
-     * is TRANSIENT data. Make a copy if you want the data to live outside
-     * of this extractor's lifetime.
-     */
-    protected void extract(CrawlURI curi, CharSequence cs) {
-        Matcher tags = TextUtils.getMatcher(relevantTagPattern, cs);
-        while (tags.find()) {
-            if (Thread.interrupted()) {
-                break;
-            }
-            if (tags.start(8) > 0) {
-                // comment match
-                // for now do nothing
-            } else if (tags.start(7) > 0) {
-                // <meta> match
-                int start = tags.start(5);
-                int end = tags.end(5);
-                assert start >= 0: "Start is: " + start + ", " + curi;
-                assert end >= 0: "End is :" + end + ", " + curi;
-                if (processMeta(curi, cs.subSequence(start, end))) {
-                    // meta tag included NOFOLLOW; abort processing
-                    break;
-                }
-            } else if (tags.start(5) > 0) {
-                // generic <whatever> match
-                int start5 = tags.start(5);
-                int end5 = tags.end(5);
-                assert start5 >= 0: "Start is: " + start5 + ", " + curi;
-                assert end5 >= 0: "End is :" + end5 + ", " + curi;
-                int start6 = tags.start(6);
-                int end6 = tags.end(6);
-                assert start6 >= 0: "Start is: " + start6 + ", " + curi;
-                assert end6 >= 0: "End is :" + end6 + ", " + curi;
-                String element = cs.subSequence(start6, end6).toString();
-                CharSequence attributes = cs.subSequence(start5, end5);
-                processGeneralTag(curi, element, attributes);
-                // remember FORM to help later extra processing
-                if ("form".equalsIgnoreCase(element)) {
-                    curi.getDataList(A_FORM_OFFSETS).add((Integer) (start6 - 1));
-                }
-
-            } else if (tags.start(1) > 0) {
-                // <script> match