Skip to content

Commit ba9dc70

Browse files
WAT: trim data URLs, fixes #48
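Trim the data part of data URLs (https://www.rfc-editor.org/rfc/rfc2397) to reduce the size of WAT files. E.g., the URL data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg== is trimmed to data:image/png;base64,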
1 parent 1446d35 commit ba9dc70

File tree

2 files changed

+89
-14
lines changed

2 files changed

+89
-14
lines changed

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ public void handleTagOpen(TagNode tag) {
179179
attrName = attrName.toLowerCase(Locale.ROOT);
180180
if (globalHrefAttributes.contains(attrName)) {
181181
attrValue = decodeCharEnt(attrValue);
182+
attrValue = trimDataUrl(attrValue);
182183
data.addHref(PATH,makePath(name,attrName),"url",attrValue);
183184
}
184185
}
@@ -382,24 +383,36 @@ private static void addBasicHrefs(HTMLMetaData data, TagNode node, String... att
382383
String val = node.getAttribute(attr);
383384
if(val != null) {
384385
val = decodeCharEnt(val);
386+
val = trimDataUrl(val);
385387
data.addHref(PATH,makePath(node.getTagName(),attr),"url",val);
386388
}
387389
}
388390
}
389391

390392
private static ArrayList<String> getAttrList(TagNode node, String... attrs) {
391393
ArrayList<String> l = new ArrayList<String>();
394+
boolean isOgImage = false;
392395
for(String attr : attrs) {
393396
String val = node.getAttribute(attr);
394397
if(val != null) {
395398
val = decodeCharEnt(val);
396399
l.add(attr);
397400
l.add(val);
401+
if (attr.equals("property") && val.equals("og:image")) {
402+
isOgImage = true;
403+
}
398404
}
399405
}
400406
if(l.size() == 0) {
401407
return null;
402408
}
409+
if (isOgImage) {
410+
// trim data: URLs in og:image metadata
411+
int content = l.indexOf("content");
412+
if (content > -1 && (content % 2) == 0) {
413+
l.set(content + 1, trimDataUrl(l.get(content + 1)));
414+
}
415+
}
403416
return l;
404417
}
405418

@@ -409,6 +422,7 @@ private static ArrayList<String> getAttrListUrl(TagNode node,
409422
ArrayList<String> l = null;
410423
if(url != null) {
411424
url = decodeCharEnt(url);
425+
url = trimDataUrl(url);
412426
l = new ArrayList<String>();
413427
l.add(PATH);
414428
l.add(makePath(node.getTagName(),urlAttr));
@@ -442,6 +456,7 @@ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
442456
for (Pattern pattern : jsOnClickUrlPatterns) {
443457
String url = patternJSExtract(pattern, onclick);
444458
if (url != null) {
459+
url = trimDataUrl(url);
445460
data.addHref(PATH, path, "url", url);
446461
}
447462
}
@@ -483,6 +498,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
483498
if(url != null) {
484499
// got data:
485500
url = decodeCharEnt(url);
501+
url = trimDataUrl(url);
486502
l.add(PATH);
487503
l.add(makePath("A","href"));
488504
l.add("url");
@@ -520,6 +536,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
520536
String url = node.getAttribute("href");
521537
if(url != null) {
522538
url = decodeCharEnt(url);
539+
url = trimDataUrl(url);
523540
ArrayList<String> l = new ArrayList<String>();
524541
l.add(PATH);
525542
l.add(makePath("AREA","href"));
@@ -583,6 +600,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
583600
String url = node.getAttribute("action");
584601
if(url != null) {
585602
url = decodeCharEnt(url);
603+
url = trimDataUrl(url);
586604
// got data:
587605
l.add(PATH);
588606
l.add(makePath("FORM","action"));
@@ -728,7 +746,8 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
728746
String url = m.group(1);
729747
url = cssUrlTrimPattern.matcher(url).replaceAll("");
730748
if (!url.isEmpty()) {
731-
data.addHref("path","STYLE/#text","href", url);
749+
url = trimDataUrl(url);
750+
data.addHref("path", "STYLE/#text", "href", url);
732751
}
733752
}
734753
}
@@ -757,4 +776,36 @@ public static String decodeCharEnt(String text, boolean inAttribute) {
757776
return text;
758777
}
759778
}
779+
780+
/**
781+
* Trim data from
782+
* <a href="https://www.rfc-editor.org/rfc/rfc2397#section-2">data URLs</a>.
783+
*
784+
* Any data (after the comma) is trimmed from a data URL. If no comma is
785+
* found within the first 128 characters of the URL, the URL is trimmed to
786+
* 128 characters.
787+
*
788+
* @param url
789+
* URL to be trimmed
790+
* @return
791+
*/
792+
public static String trimDataUrl(String url) {
793+
if (url.startsWith("data:")) {
794+
int posComma = url.indexOf(',', 5);
795+
if (posComma == -1) {
796+
// no comma, trim to 128 characters if necessary
797+
if (url.length() > 128) {
798+
return url.substring(0, 128);
799+
}
800+
return url;
801+
} else if (posComma > 128) {
802+
return url.substring(0, 128);
803+
} else if (posComma == 6) {
804+
return "data:,";
805+
} else if (posComma > 6) {
806+
return url.substring(0, posComma + 1);
807+
}
808+
}
809+
return url;
810+
}
760811
}

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -60,18 +60,24 @@ public void testHandleStyleNodeExceptions() throws Exception {
6060
}
6161

6262
public void testHandleStyleNode() throws Exception {
63-
String[][] tests = {
64-
{""},
65-
{"url(foo.gif)","foo.gif"},
66-
{"url('foo.gif')","foo.gif"},
67-
{"url(\"foo.gif\")","foo.gif"},
68-
{"url(\\\"foo.gif\\\")","foo.gif"},
69-
{"url(\\'foo.gif\\')","foo.gif"},
70-
{"url(''foo.gif'')","foo.gif"},
71-
{"url( foo.gif )","foo.gif"},
72-
{"url('''')"},
73-
{"url('foo.gif'')","foo.gif"},
74-
};
63+
String[][] tests = { //
64+
{""}, //
65+
{"url(foo.gif)","foo.gif"}, //
66+
{"url('foo.gif')","foo.gif"}, //
67+
{"url(\"foo.gif\")","foo.gif"}, //
68+
{"url(\\\"foo.gif\\\")","foo.gif"}, //
69+
{"url(\\'foo.gif\\')","foo.gif"}, //
70+
{"url(''foo.gif'')","foo.gif"}, //
71+
{"url( foo.gif )","foo.gif"}, //
72+
{"url('''')"}, //
73+
{"url('foo.gif'')","foo.gif"}, //
74+
{"url('')","data:image/png;base64,"}, //
75+
{"url(\"data:image/svg+xml,%3Csvg%20xmlns=%22http://www.w3.org/2000/svg%22%20viewBox=%220%200%2080%2080%22%3E%3C/svg%3E\")",
76+
"data:image/svg+xml," },
77+
// would fail: the pattern extractor stops at the first white space in the data URL
78+
// {"background-image: url('data:image/svg+xml,%3Csvg xmlns=\"http://www.w3.org/2000/svg\" viewBox=\"0 0 40 40\"%3E%3Ccircle r=\"18\" cx=\"20\" cy=\"20\" fill=\"red\" /%3E%3C/svg%3E');\n",
79+
// "data:image/svg+xml," },
80+
};
7581
for(String[] testa : tests) {
7682
checkExtract(testa);
7783
}
@@ -125,7 +131,7 @@ private void checkExtract(String[] data) throws JSONException {
125131
}
126132
JSONArray a = md.optJSONArray("Links");
127133
if(data.length > 1) {
128-
assertNotNull(a);
134+
assertNotNull("CSS link extraction failed for <" + css + ">", a);
129135
assertEquals(data.length-1,a.length());
130136
for(int i = 1; i < data.length; i++) {
131137
Object o = a.optJSONObject(i-1);
@@ -531,4 +537,22 @@ public void testHtmlParserEntityDecoding() {
531537
}
532538
}
533539

540+
public void testTrimDataURLs() {
541+
String[][] urls = { //
542+
{ "", "data:image/png;base64," }, //
543+
{ "data:image/svg+xml,%3Csvg%20xmlns=%22http://www.w3.org/2000/svg%22%20viewBox=%220%200%2080%2080%22%3E%3C/svg%3E",
544+
"data:image/svg+xml," }, //
545+
{ "data:image/svg+xml,%3Csvg xmlns=\"http://www.w3.org/2000/svg\" viewBox=\"0 0 40 40\"%3E%3Ccircle r=\"18\" cx=\"20\" cy=\"20\" fill=\"red\" /%3E%3C/svg%3E",
546+
"data:image/svg+xml," }, //
547+
{ "data:image/svg+xml;utf9,<svg%20version='1.1'%20xmlns='http://www.w3.org/2000/svg'><filter%20id='blur'><feGaussianBlur%20stdDeviation='10'%20/></filter></svg>#blur",
548+
"data:image/svg+xml;utf9," }, //
549+
{ "data:application/font-woff;charset=utf-8;base64,d09GRgABAAAAAAUQAA0AAAAA",
550+
"data:application/font-woff;charset=utf-8;base64," }, //
551+
{ "data:text/plain;charset=iso-8859-7,%be%fg%be", "data:text/plain;charset=iso-8859-7," }, //
552+
};
553+
for (String[] url : urls) {
554+
String u = ExtractingParseObserver.trimDataUrl(url[0]);
555+
assertEquals("Entity " + url[0] + " not properly trimmed", url[1], u);
556+
}
557+
}
534558
}

0 commit comments

Comments
 (0)