Merge upstream 'master' branch from 'iipc/webarchive-commons'

sebastian-nagel · sebastian-nagel · commit 1446d357b150 · 2025-01-08T16:43:56.000+01:00
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,3 +1,40 @@
+1.3.0
+-----
+
+#### URL Canonicalization Changed
+
+The output of WaybackURLKeyMaker and other canonicalizers based on BasicURLCanonicalizer has changed for URLs that
+contain non-UTF-8 percent-encoded sequences. For example, when a URL contains "%C3%23" it will now be normalised to
+"%c3%23" whereas previous releases produced "%25c3%23". This change brings webarchive-commons more in line with pywb,
+surt (Python), warcio.js and RFC 3986. While CDX file compatibility with these newer tools should improve, note that CDX
+files generated by the new release which contain such URLs may not work correctly with existing versions of 
+OpenWayback that use the older webarchive-commons. [#102](https://github.com/iipc/webarchive-commons/pull/102)
+
+#### Bug fixes
+
+* WAT: Duplicated payload metadata values for "Actual-Content-Length" and "Trailing-Slop-Length" [#103](https://github.com/iipc/webarchive-commons/pull/103)
+* ObjectPlusFilesOutputStream.hardlinkOrCopy now uses `Files.createLink()` instead of executing `ln`. This
+  prevents potential security vulnerabilities from command-line option injection and improves portability.
+
+#### Dependency upgrades
+
+* fastutil removed
+* dsiutils removed
+
+#### Deprecations
+
+The following classes and enum members have been marked deprecated as a step towards removal of the dependency on
+Apache Commons HttpClient 3.1.
+
+* org.archive.httpclient.HttpRecorderGetMethod
+* org.archive.httpclient.HttpRecorderMethod
+* org.archive.httpclient.HttpRecorderPostMethod
+* org.archive.httpclient.SingleHttpConnectionManager
+* org.archive.httpclient.ThreadLocalHttpConnectionManager
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLR
+* org.archive.util.binsearch.impl.http.ApacheHttp31SLRFactory
+* org.archive.util.binsearch.impl.http.HTTPSeekableLineReaderFactory.HttpLibs.APACHE_31
+
 1.2.0
 -----
 
diff --git a/pom.xml b/pom.xml
@@ -3,7 +3,7 @@
 
   <groupId>org.commoncrawl</groupId>
   <artifactId>ia-web-commons</artifactId>
-  <version>1.2.1-SNAPSHOT</version>
+  <version>1.3.1-SNAPSHOT</version>
   <packaging>jar</packaging>
 
   <name>ia-web-commons</name>
diff --git a/src/main/java/org/archive/url/BasicURLCanonicalizer.java b/src/main/java/org/archive/url/BasicURLCanonicalizer.java
@@ -15,18 +15,18 @@
 /**
  * Canonicalizer that does more or less basic fixup. Based initially on rules
  * specified at <a href=
- * "https://developers.google.com/safe-browsing/developers_guide_v2#Canonicalization"
+ * "https://web.archive.org/web/20130306015559/https://developers.google.com/safe-browsing/developers_guide_v2#Canonicalization"
  * >https://developers.google.com/safe-browsing/developers_guide_v2#
- * Canonicalization</a>. These rules are designed for clients of google's
+ * Canonicalization</a>. These rules are designed for clients of Google's
  * "experimental" Safe Browsing API to "check URLs against Google's
  * constantly-updated blacklists of suspected phishing and malware pages".
  * 
  * <p>
- * This class differs from google in treatment of non-ascii input. Google's
+ * This class differs from Google in treatment of non-ascii input. Google's
  * rules don't really address this except with one example test case, which
  * seems to suggest taking raw input bytes and pct-encoding them byte for byte.
  * Since the input to this class consists of java strings, not raw bytes, that
- * wouldn't be possible, even if deemed preferable. Instead
+ * wouldn't be possible, even if deemed preferable. Instead,
  * BasicURLCanonicalizer expresses non-ascii characters pct-encoded UTF-8.
  */
 public class BasicURLCanonicalizer implements URLCanonicalizer {
@@ -212,6 +212,10 @@ protected static Charset UTF8() {
 		return _UTF8;
 	}
 
+	/**
+	 * @param input String to be percent-encoded. Assumed to be fully unescaped.
+	 * @return percent-encoded string
+	 */
 	public String escapeOnce(String input) {
 		if (input == null) {
 			return null;
@@ -243,6 +247,19 @@ public String escapeOnce(String input) {
 					 */
 					sb = new StringBuilder(input.substring(0, i));
 				}
+				if (b == '%' && i < utf8bytes.length - 2) {
+					// Any hex escapes left at this point represent non-UTF-8 encoded characters
+					// Unescape them, so they don't get double escaped
+					int hex1 = getHex(utf8bytes[i + 1]);
+					if (hex1 >= 0) {
+						int hex2 = getHex(utf8bytes[i + 2]);
+						if (hex2 >= 0) {
+							i = i+2;
+							b = hex1 * 16 + hex2;
+						}
+					}
+
+				}
 				sb.append("%");
 				String hex = Integer.toHexString(b).toUpperCase();
 				if (hex.length() == 1) {
@@ -337,7 +354,7 @@ public String decode(String input) {
 	 * Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If
 	 * decoding of any portion fails, appends the un-decodable %xx%xx sequence
 	 * extracted from inputStr instead of decoded characters. See "bad unicode"
-	 * tests in GoogleCanonicalizerTest#testDecode(). Variables only make sense
+	 * tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense
 	 * within context of {@link #decode(String)}.
 	 * 
 	 * @param sb
diff --git a/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java b/src/test/java/org/archive/url/BasicURLCanonicalizerTest.java
@@ -143,6 +143,9 @@ public void testUnescapeRepeatedly() {
 		assertEquals("%",guc.unescapeRepeatedly("%25%32%35"));
 		
 		assertEquals("168.188.99.26",guc.unescapeRepeatedly("%31%36%38%2e%31%38%38%2e%39%39%2e%32%36"));
+
+		assertEquals("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5",
+				guc.unescapeRepeatedly("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
 	}
 	
 	public void testAttemptIPFormats() throws URIException {
diff --git a/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java b/src/test/java/org/archive/url/WaybackURLKeyMakerTest.java
@@ -26,6 +26,12 @@ public void testMakeKey() throws URISyntaxException {
 		assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));
 		assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",
 				km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));
+		assertEquals("ua,1kr)/newslist.html?tag=%e4%ee%f8%ea%ee%eb%fc%ed%ee%e5",
+				km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));
+		assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",
+				km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));
+		assertEquals("ac,insbase)/xoops2/modules/xpwiki?%a4%d5%a4%af%a4%aa%a4%ab%b8%a9%a4%aa%a4%aa%a4%ce%a4%b8%a4%e7%a4%a6%bb%d4",
+			km.makeKey("https://www.insbase.ac/xoops2/modules/xpwiki/?%A4%D5%A4%AF%A4%AA%A4%AB%B8%A9%A4%AA%A4%AA%A4%CE%A4%B8%A4%E7%A4%A6%BB%D4"));
 	}
 
 }

Original file line number	Diff line number	Diff line change
`@@ -143,6 +143,9 @@ public void testUnescapeRepeatedly() {`
`143`	`143`	`assertEquals("%",guc.unescapeRepeatedly("%25%32%35"));`
`144`	`144`
`145`	`145`	`assertEquals("168.188.99.26",guc.unescapeRepeatedly("%31%36%38%2e%31%38%38%2e%39%39%2e%32%36"));`
	`146`	`+`
	`147`	`+ assertEquals("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5",`
	`148`	`+ guc.unescapeRepeatedly("tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));`
`146`	`149`	`}`
`147`	`150`
`148`	`151`	`public void testAttemptIPFormats() throws URIException {`
Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,12 @@ public void testMakeKey() throws URISyntaxException {`
`26`	`26`	`assertEquals("192,211,203,34)/robots.txt", km.makeKey("https://34.203.211.192/robots.txt"));`
`27`	`27`	`assertEquals("2600:1f18:200d:fb00:2b74:867c:ab0c:150a)/robots.txt",`
`28`	`28`	`km.makeKey("https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/robots.txt"));`
	`29`	`+ assertEquals("ua,1kr)/newslist.html?tag=%e4%ee%f8%ea%ee%eb%fc%ed%ee%e5",`
	`30`	`+ km.makeKey("http://1kr.ua/newslist.html?tag=%E4%EE%F8%EA%EE%EB%FC%ED%EE%E5"));`
	`31`	`+ assertEquals("com,aluroba)/tags/%c3%ce%ca%c7%d1%e5%c7.htm",`
	`32`	`+ km.makeKey("http://www.aluroba.com/tags/%C3%CE%CA%C7%D1%E5%C7.htm"));`
	`33`	`+ assertEquals("ac,insbase)/xoops2/modules/xpwiki?%a4%d5%a4%af%a4%aa%a4%ab%b8%a9%a4%aa%a4%aa%a4%ce%a4%b8%a4%e7%a4%a6%bb%d4",`
	`34`	`+ km.makeKey("https://www.insbase.ac/xoops2/modules/xpwiki/?%A4%D5%A4%AF%A4%AA%A4%AB%B8%A9%A4%AA%A4%AA%A4%CE%A4%B8%A4%E7%A4%A6%BB%D4"));`
`29`	`35`	`}`
`30`	`36`
`31`	`37`	`}`