Skip to content

Commit 1446d35

Browse files
Merge upstream 'master' branch from 'iipc/webarchive-commons'
2 parents e10c525 + a8fd8a7 commit 1446d35

File tree

5 files changed

+69
-6
lines changed

5 files changed

+69
-6
lines changed

CHANGES.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,40 @@
1+
1.3.0
2+
-----
3+
4+
#### URL Canonicalization Changed
5+
6+
The output of WaybackURLKeyMaker and other canonicalizers based on BasicURLCanonicalizer has changed for URLs that
7+
contain non-UTF-8 percent-encoded sequences. For example, when a URL contains "%C3%23" it will now be normalised to
8+
"%c3%23" whereas previous releases produced "%25c3%23". This change brings webarchive-commons more in line with pywb,
9+
surt (Python), warcio.js and RFC 3986. While CDX file compatibility with these newer tools should improve, note that CDX
10+
files generated by the new release which contain such URLs may not work correctly with existing versions of
11+
OpenWayback that use the older webarchive-commons. [#102](https://github.com/iipc/webarchive-commons/pull/102)
12+
13+
#### Bug fixes
14+
15+
* WAT: Duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length" [#103](https://github.com/iipc/webarchive-commons/pull/103)
16+
* ObjectPlusFilesOutputStream.hardlinkOrCopy now uses `Files.createLink()` instead of executing `ln`. This
17+
prevents the potential for security vulnerabilities from command-line option injection and improves portability.
18+
19+
#### Dependency upgrades
20+
21+
* fastutil removed
22+
* dsiutils removed
23+
24+
#### Deprecations
25+
26+
The following classes and enum members have been marked deprecated as a step towards removal of the dependency on
27+
Apache Commons HttpClient 3.1.
28+
29+
* org.archive.httpclient.HttpRecorderGetMethod
30+
* org.archive.httpclient.HttpRecorderMethod
31+
* org.archive.httpclient.HttpRecorderPostMethod
32+
* org.archive.httpclient.SingleHttpConnectionManager
33+
* org.archive.httpclient.ThreadLocalHttpConnectionManager
34+
* org.archive.util.binsearch.impl.http.ApacheHttp31SLR
35+
* org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory
36+
* org.archive.util.binsearch.impl.http.HTTPSeekableLineReaderFactory.HttpLibs.APACHE_31
37+
138
1.2.0
239
-----
340

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
<groupId>org.commoncrawl</groupId>
55
<artifactId>ia-web-commons</artifactId>
6-
<version>1.2.1-SNAPSHOT</version>
6+
<version>1.3.1-SNAPSHOT</version>
77
<packaging>jar</packaging>
88

99
<name>ia-web-commons</name>

src/main/java/org/archive/url/BasicURLCanonicalizer.java

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,18 @@
1515
/**
1616
* Canonicalizer that does more or less basic fixup. Based initially on rules
1717
* specified at <a href=
18-
* "https://developers.google.com/safe-browsing/developers_guide_v2#Canonicalization"
18+
* "https://web.archive.org/web/20130306015559/https://developers.google.com/safe-browsing/developers_guide_v2#Canonicalization"
1919
* >https://developers.google.com/safe-browsing/developers_guide_v2#
20-
* Canonicalization</a>. These rules are designed for clients of google's
20+
* Canonicalization</a>. These rules are designed for clients of Google's
2121
* "experimental" Safe Browsing API to "check URLs against Google's
2222
* constantly-updated blacklists of suspected phishing and malware pages".
2323
*
2424
* <p>
25-
* This class differs from google in treatment of non-ascii input. Google's
25+
* This class differs from Google in treatment of non-ascii input. Google's
2626
* rules don't really address this except with one example test case, which
2727
* seems to suggest taking raw input bytes and pct-encoding them byte for byte.
2828
* Since the input to this class consists of java strings, not raw bytes, that
29-
* wouldn't be possible, even if deemed preferable. Instead
29+
* wouldn't be possible, even if deemed preferable. Instead,
3030
* BasicURLCanonicalizer expresses non-ascii characters pct-encoded UTF-8.
3131
*/
3232
public class BasicURLCanonicalizer implements URLCanonicalizer {
@@ -212,6 +212,10 @@ protected static Charset UTF8() {
212212
return _UTF8;
213213
}
214214

215+
/**
216+
* @param input String to be percent-encoded. Assumed to be fully unescaped.
217+
* @return percent-encoded string
218+
*/
215219
public String escapeOnce(String input) {
216220
if (input == null) {
217221
return null;
@@ -243,6 +247,19 @@ public String escapeOnce(String input) {
243247
*/
244248
sb = new StringBuilder(input.substring(0, i));
245249
}
250+
if (b == '%' && i < utf8bytes.length - 2) {
251+
// Any hex escapes left at this point represent non-UTF-8 encoded characters
252+
// Unescape them, so they don't get double escaped
253+
int hex1 = getHex(utf8bytes[i + 1]);
254+
if (hex1 >= 0) {
255+
int hex2 = getHex(utf8bytes[i + 2]);
256+
if (hex2 >= 0) {
257+
i = i+2;
258+
b = hex1 * 16 + hex2;
259+
}
260+
}
261+
262+
}
246263
sb.append("%");
247264
String hex = Integer.toHexString(b).toUpperCase();
248265
if (hex.length() == 1) {
@@ -337,7 +354,7 @@ public String decode(String input) {
337354
* Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If
338355
* decoding of any portion fails, appends the un-decodable %xx%xx sequence
339356
* extracted from inputStr instead of decoded characters. See "bad unicode"
340-
* tests in GoogleCanonicalizerTest#testDecode(). Variables only make sense
357+
* tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense
341358
* within context of {@link #decode(String)}.
342359
*
343360
* @param sb

src/test/java/org/archive/url/BasicURLCanonicalizerTest.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ public void testUnescapeRepeatedly() {
143143
assertEquals("%",guc.unescapeRepeatedly("%25%32%35"));
144144

145145
assertEquals("168.188.99.26",guc.unescapeRepeatedly("%31%36%38%2e%31%38%38%2e%39%39%2e%32%36"));
146+
147+
assertEquals("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5",
148+
guc.unescapeRepeatedly("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
146149
}
147150

148151
public void testAttemptIPFormats() throws URIException {

src/test/java/org/archive/url/WaybackURLKeyMakerTest.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ public void testMakeKey() throws URISyntaxException {
2626
assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));
2727
assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",
2828
km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));
29+
assertEquals("ua,1kr)/newslist.html?tag=%e4%ee%f8%ea%ee%eb%fc%ed%ee%e5",
30+
km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
31+
assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
32+
km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
33+
assertEquals("ac,insbase)/xoops2/modules/xpwiki?%a4%d5%a4%af%a4%aa%a4%ab%b8%a9%a4%aa%a4%aa%a4%ce%a4%b8%a4%e7%a4%a6%bb%d4",
34+
km.makeKey("https://www.insbase.ac/xoops2/modules/xpwiki/?%A4%D5%A4%AF%A4%AA%A4%AB%B8%A9%A4%AA%A4%AA%A4%CE%A4%B8%A4%E7%A4%A6%BB%D4"));
2935
}
3036

3137
}

0 commit comments

Comments
 (0)