diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java index c86ec92fe..5262f418d 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java @@ -47,6 +47,8 @@ public abstract class AbstractHttpProtocol implements Protocol { protected boolean useCookies = false; + protected List protocolVersions; + protected static final String RESPONSE_COOKIES_HEADER = "set-cookie"; protected String protocolMDprefix = ""; @@ -58,6 +60,8 @@ public void configure(Config conf) { this.storeHTTPHeaders = ConfUtils.getBoolean(conf, "http.store.headers", false); this.useCookies = ConfUtils.getBoolean(conf, "http.use.cookies", false); + this.protocolVersions = ConfUtils + .loadListFromConf("http.protocol.versions", conf); robots = new HttpRobotRulesParser(conf); protocolMDprefix = ConfUtils.getString(conf, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, protocolMDprefix); diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/okhttp/HttpProtocol.java b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/okhttp/HttpProtocol.java index f3d3107b1..75b2b1378 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/okhttp/HttpProtocol.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/okhttp/HttpProtocol.java @@ -23,6 +23,7 @@ import java.net.Proxy; import java.net.URL; import java.security.cert.CertificateException; +import java.util.ArrayList; import java.util.Base64; import java.util.LinkedList; import java.util.List; @@ -137,6 +138,37 @@ public void configure(Config conf) { .writeTimeout(timeout, TimeUnit.MILLISECONDS) .readTimeout(timeout, TimeUnit.MILLISECONDS); + // protocols in order of preference, see + // 
https://square.github.io/okhttp/4.x/okhttp/okhttp3/-ok-http-client/-builder/protocols/ + List protocols = new ArrayList<>(); + for (String pVersion : protocolVersions) { + switch(pVersion) { + case "h2": + protocols.add(okhttp3.Protocol.HTTP_2); + break; + case "h2c": + if (protocolVersions.size() > 1) { + LOG.error("h2c ignored, it cannot be combined with any other protocol"); + } else { + protocols.add(okhttp3.Protocol.H2_PRIOR_KNOWLEDGE); + } + break; + case "http/1.1": + protocols.add(okhttp3.Protocol.HTTP_1_1); + break; + case "http/1.0": + LOG.warn("http/1.0 ignored, not supported by okhttp for requests"); + break; + default: + LOG.error("{}: unknown protocol version", pVersion); + break; + } + } + if (protocols.size() > 0) { + LOG.info("Using protocol versions: {}", protocols); + builder.protocols(protocols); + } + String userAgent = getAgentString(conf); if (StringUtils.isNotBlank(userAgent)) { customRequestHeaders.add(new String[] { "User-Agent", userAgent }); diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/util/ConfUtils.java b/core/src/main/java/com/digitalpebble/stormcrawler/util/ConfUtils.java index 722716382..2963232b0 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/util/ConfUtils.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/util/ConfUtils.java @@ -22,6 +22,7 @@ import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.LinkedList; import java.util.List; @@ -88,6 +89,8 @@ public static List loadListFromConf(String paramKey, Map stormConf) { if (obj instanceof PersistentVector) { list.addAll((PersistentVector) obj); + } else if (obj instanceof Collection) { + list.addAll((Collection) obj); } else { // single value? 
list.add(obj.toString()); } diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml index bf65e0dcc..0c1bc34a6 100644 --- a/core/src/main/resources/crawler-default.yaml +++ b/core/src/main/resources/crawler-default.yaml @@ -113,6 +113,21 @@ config: https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol" file.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.file.FileProtocol" + # the http/https protocol versions to use, in order of preference + # Details of the protocol negotiation between the client and + # the crawled server depend on the chosen protocol implementation. + # If no protocol versions are listed, the protocol implementation + # will use its defaults. + http.protocol.versions: + # HTTP/2 over TLS (protocol negotiation via ALPN) + #- "h2" + # HTTP/1.1 + #- "http/1.1" + # HTTP/1.0 + #- "http/1.0" + # HTTP/2 over TCP + ##- "h2c" + # key values obtained by the protocol can be prefixed # to avoid accidental overwrites. Note that persisted # or transferred protocol metadata must also be prefixed. diff --git a/external/warc/README.md b/external/warc/README.md index b90705511..bd5641d2d 100644 --- a/external/warc/README.md +++ b/external/warc/README.md @@ -153,6 +153,12 @@ Writing complete and valid WARC requires that HTTP headers, IP address and captu https.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol ``` +Until the WARC bolt can write HTTP/2 requests and responses in a way compatible with most WARC readers (see #828), HTTP/1.1 should be used by setting: +``` + http.protocol.versions: + - "http/1.1" +``` + ## Consuming WARC files