|
15 | 15 | /** |
16 | 16 | * Canonicalizer that does more or less basic fixup. Based initially on rules |
17 | 17 | * specified at <a href= |
18 | | - * "https://developers.google.com/safe-browsing/developers_guide_v2#Canonicalization" |
| 18 | + * "https://web.archive.org/web/20130306015559/https://developers.google.com/safe-browsing/developers_guide_v2#Canonicalization" |
19 | 19 | * >https://developers.google.com/safe-browsing/developers_guide_v2# |
20 | | - * Canonicalization</a>. These rules are designed for clients of google's |
| 20 | + * Canonicalization</a>. These rules are designed for clients of Google's |
21 | 21 | * "experimental" Safe Browsing API to "check URLs against Google's |
22 | 22 | * constantly-updated blacklists of suspected phishing and malware pages". |
23 | 23 | * |
24 | 24 | * <p> |
25 | | - * This class differs from google in treatment of non-ascii input. Google's |
| 25 | + * This class differs from Google in treatment of non-ascii input. Google's |
26 | 26 | * rules don't really address this except with one example test case, which |
27 | 27 | * seems to suggest taking raw input bytes and pct-encoding them byte for byte. |
28 | 28 | * Since the input to this class consists of java strings, not raw bytes, that |
29 | | - * wouldn't be possible, even if deemed preferable. Instead |
| 29 | + * wouldn't be possible, even if deemed preferable. Instead, |
30 | 30 | * BasicURLCanonicalizer expresses non-ASCII characters as pct-encoded UTF-8. |
31 | 31 | */ |
32 | 32 | public class BasicURLCanonicalizer implements URLCanonicalizer { |
@@ -212,6 +212,10 @@ protected static Charset UTF8() { |
212 | 212 | return _UTF8; |
213 | 213 | } |
214 | 214 |
|
| 215 | + /** |
| 216 | + * @param input String to be percent-encoded. Assumed to be fully unescaped. |
| 217 | + * @return percent-encoded string |
| 218 | + */ |
215 | 219 | public String escapeOnce(String input) { |
216 | 220 | if (input == null) { |
217 | 221 | return null; |
@@ -243,6 +247,19 @@ public String escapeOnce(String input) { |
243 | 247 | */ |
244 | 248 | sb = new StringBuilder(input.substring(0, i)); |
245 | 249 | } |
| 250 | + if (b == '%' && i < utf8bytes.length - 2) { |
| 251 | + // Any hex escapes left at this point represent non-UTF-8 encoded characters |
| 252 | + // Unescape them, so they don't get double escaped |
| 253 | + int hex1 = getHex(utf8bytes[i + 1]); |
| 254 | + if (hex1 >= 0) { |
| 255 | + int hex2 = getHex(utf8bytes[i + 2]); |
| 256 | + if (hex2 >= 0) { |
| 257 | + i = i+2; |
| 258 | + b = hex1 * 16 + hex2; |
| 259 | + } |
| 260 | + } |
| 261 | + |
| 262 | + } |
246 | 263 | sb.append("%"); |
247 | 264 | String hex = Integer.toHexString(b).toUpperCase(); |
248 | 265 | if (hex.length() == 1) { |
@@ -337,7 +354,7 @@ public String decode(String input) { |
337 | 354 | * Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If |
338 | 355 | * decoding of any portion fails, appends the un-decodable %xx%xx sequence |
339 | 356 | * extracted from inputStr instead of decoded characters. See "bad unicode" |
340 | | - * tests in GoogleCanonicalizerTest#testDecode(). Variables only make sense |
| 357 | + * tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense |
341 | 358 | * within context of {@link #decode(String)}. |
342 | 359 | * |
343 | 360 | * @param sb |
|
0 commit comments