Skip to content

Commit

Permalink
HTTP protocol implementation: allow to configure which protocol versi…
Browse files Browse the repository at this point in the history
…on(s) to use

implements apache#827
- configuration key `http.protocol.versions` holds a list of protocols
  in order of preference
- implement selection of protocols in okhttp protocol implementation
- fix loading of YAML lists when the configuration isn't loaded via Storm
  (e.g. by the main method of AbstractHttpProtocol)
  • Loading branch information
sebastian-nagel committed Oct 5, 2020
1 parent 3dcacfa commit c001b62
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ public abstract class AbstractHttpProtocol implements Protocol {

protected boolean useCookies = false;

protected List<String> protocolVersions;

protected static final String RESPONSE_COOKIES_HEADER = "set-cookie";

protected String protocolMDprefix = "";
Expand All @@ -58,6 +60,8 @@ public void configure(Config conf) {
this.storeHTTPHeaders = ConfUtils.getBoolean(conf,
"http.store.headers", false);
this.useCookies = ConfUtils.getBoolean(conf, "http.use.cookies", false);
this.protocolVersions = ConfUtils
.loadListFromConf("http.protocol.versions", conf);
robots = new HttpRobotRulesParser(conf);
protocolMDprefix = ConfUtils.getString(conf,
ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, protocolMDprefix);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.net.Proxy;
import java.net.URL;
import java.security.cert.CertificateException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.LinkedList;
import java.util.List;
Expand Down Expand Up @@ -137,6 +138,37 @@ public void configure(Config conf) {
.writeTimeout(timeout, TimeUnit.MILLISECONDS)
.readTimeout(timeout, TimeUnit.MILLISECONDS);

// protocols in order of preference, see
// https://square.github.io/okhttp/4.x/okhttp/okhttp3/-ok-http-client/-builder/protocols/
List<okhttp3.Protocol> protocols = new ArrayList<>();
for (String pVersion : protocolVersions) {
switch(pVersion) {
case "h2":
protocols.add(okhttp3.Protocol.HTTP_2);
break;
case "h2c":
if (protocolVersions.size() > 1) {
LOG.error("h2c ignored, it cannot be combined with any other protocol");
} else {
protocols.add(okhttp3.Protocol.H2_PRIOR_KNOWLEDGE);
}
break;
case "http/1.1":
protocols.add(okhttp3.Protocol.HTTP_1_1);
break;
case "http/1.0":
LOG.warn("http/1.0 ignored, not supported by okhttp for requests");
break;
default:
LOG.error("{}: unknown protocol version", pVersion);
break;
}
}
if (protocols.size() > 0) {
LOG.info("Using protocol versions: {}", protocols);
builder.protocols(protocols);
}

String userAgent = getAgentString(conf);
if (StringUtils.isNotBlank(userAgent)) {
customRequestHeaders.add(new String[] { "User-Agent", userAgent });
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
Expand Down Expand Up @@ -88,6 +89,8 @@ public static List<String> loadListFromConf(String paramKey, Map stormConf) {

if (obj instanceof PersistentVector) {
list.addAll((PersistentVector) obj);
} else if (obj instanceof Collection) {
list.addAll((Collection<String>) obj);
} else { // single value?
list.add(obj.toString());
}
Expand Down
15 changes: 15 additions & 0 deletions core/src/main/resources/crawler-default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,21 @@ config:
https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
file.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.file.FileProtocol"

# The HTTP/HTTPS protocol versions to use, in order of preference.
# Details of the protocol negotiation between the client and
# the crawled server depend on the chosen protocol implementation.
# If no protocol versions are listed, the protocol implementation
# will use its defaults.
http.protocol.versions:
# HTTP/2 over TLS (protocol negotiation via ALPN)
#- "h2"
# HTTP/1.1
#- "http/1.1"
# HTTP/1.0
#- "http/1.0"
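# HTTP/2 over cleartext TCP (prior knowledge); with the okhttp
# implementation it cannot be combined with any other protocol version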
##- "h2c"

# key values obtained by the protocol can be prefixed
# to avoid accidental overwrites. Note that persisted
# or transferred protocol metadata must also be prefixed.
Expand Down
6 changes: 6 additions & 0 deletions external/warc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,12 @@ Writing complete and valid WARC requires that HTTP headers, IP address and captu
https.protocol.implementation: com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol
```

Until the WARC bolt can write HTTP/2 requests and responses in a way that is compatible with most WARC readers (see #828), HTTP/1.1 should be used by setting:
```
http.protocol.versions:
- "http/1.1"
```


## Consuming WARC files

Expand Down

0 comments on commit c001b62

Please sign in to comment.