Skip to content

Commit da324f9

Browse files
Merge pull request #37 from commoncrawl/ia-web-commons-36-title-embedded-svg
WAT extractor: do not extract page title from embedded SVG images, fixes #36
2 parents ea6cafd + e36c876 commit da324f9

File tree

4 files changed: 118 additions (+118) and 15 deletions (-15).

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 18 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -24,8 +24,10 @@ public class ExtractingParseObserver implements ParseObserver {
2424
Stack<StringBuilder> openAnchorTexts;
2525
StringBuilder textExtract;
2626
String title = null;
27+
boolean inHead = false;
2728
boolean inTitle = false;
2829
boolean inPre = false;
30+
boolean inSVG = false;
2931

3032
protected static String cssUrlPatString =
3133
"url\\s*\\(\\s*([^)\\s]{1,8000}?)\\s*\\)";
@@ -59,7 +61,7 @@ public class ExtractingParseObserver implements ParseObserver {
5961
"button", "canvas", "caption", "col", "colgroup", "dd", "div", "dl", "dt", "embed", "fieldset",
6062
"figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
6163
"li", "map", "noscript", "object", "ol", "output", "p", "pre", "progress", "section", "table", "tbody",
62-
"textarea", "tfoot", "th", "thead", "tr", "ul", "video" };
64+
"textarea", "tfoot", "th", "thead", "title", "tr", "ul", "video" };
6365
private static final Set<String> blockElements;
6466
/* inline elements which content is not melted with surrounding words */
6567
private final static String[] INLINE_ELEMENTS_SPACING = { "address", "cite", "details", "datalist", "iframe", "img",
@@ -144,11 +146,17 @@ public void handleTagEmpty(TagNode tag) {
144146
@Override
145147
public void handleTagOpen(TagNode tag) {
146148
String name = tag.getTagName();
147-
if(name.equals("TITLE")) {
149+
if (name.equals("HEAD")) {
150+
inHead = true;
151+
} else if (name.equals("TITLE")) {
148152
inTitle = !tag.isEmptyXmlTag();
149153
return;
150154
} else if (name.equals("PRE")) {
151155
inPre = true;
156+
} else if (name.equals("SVG")) {
157+
inSVG = true;
158+
} else if (name.equals("BODY")) {
159+
inHead = false;
152160
}
153161

154162
if (blockElements.contains(name)) {
@@ -183,9 +191,11 @@ public void handleTagOpen(TagNode tag) {
183191
public void handleTagClose(TagNode tag) {
184192
String name = tag.getTagName();
185193

186-
if(inTitle) {
194+
if (inTitle) {
187195
inTitle = false;
188-
data.setTitle(title);
196+
if (!inSVG && (inHead || !data.hasTitle())) {
197+
data.setTitle(title);
198+
}
189199
title = null;
190200
}
191201

@@ -222,8 +232,12 @@ public void handleTagClose(TagNode tag) {
222232
data.addHref(vals);
223233
}
224234
}
235+
} else if (tag.getTagName().equals("HEAD")) {
236+
inHead = false;
225237
} else if (tag.getTagName().equals("PRE")) {
226238
inPre = false;
239+
} else if (tag.getTagName().equals("SVG")) {
240+
inSVG = false;
227241
}
228242
}
229243

src/main/java/org/archive/resource/html/HTMLMetaData.java

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,9 +31,15 @@ private JSONObject getHeader() {
3131
public void setBaseHref(String href) {
3232
putUnlessNull(getHeader(),HTML_BASE, href);
3333
}
34+
3435
public void setTitle(String title) {
3536
putUnlessNull(getHeader(),HTML_TITLE, title);
3637
}
38+
39+
public boolean hasTitle() {
40+
return header != null && header.has(HTML_TITLE);
41+
}
42+
3743
private void putUnlessNull(JSONObject o, String k, String v) {
3844
if(o != null) {
3945
try {
@@ -43,6 +49,7 @@ private void putUnlessNull(JSONObject o, String k, String v) {
4349
}
4450
}
4551
}
52+
4653
public String[] LtoA(List<String> l) {
4754
String[] a = new String[l.size()];
4855
l.toArray(a);

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 48 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -152,6 +152,19 @@ private void checkAnchor(Multimap<String,String> anchors, String url, String anc
152152
assertTrue("Wrong anchor text " + anchor + " for " + url, anchors.get(url).contains(anchor));
153153
}
154154

155+
private void checkTitle(Resource resource, String title) {
156+
assertNotNull(resource);
157+
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
158+
JSONObject head = resource.getMetaData().optJSONObject("Head");
159+
if (title != null) {
160+
assertNotNull(head);
161+
assertTrue("No title found", head.has(ResourceConstants.HTML_TITLE));
162+
assertEquals(title, head.get(ResourceConstants.HTML_TITLE));
163+
} else {
164+
assertFalse(head.has(ResourceConstants.HTML_TITLE));
165+
}
166+
}
167+
155168
private void checkLinks(Resource resource, String[][] expectedLinks) {
156169
assertNotNull(resource);
157170
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
@@ -247,7 +260,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
247260
{"http://www.example.com/shakespeare.html", "Q@/cite"},
248261
{"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
249262
};
250-
checkLinks(extractor.getNext(), html4links);
263+
Resource resource = extractor.getNext();
264+
checkTitle(resource, "Test XHTML Link Extraction");
265+
checkLinks(resource, html4links);
251266
String[][] html5links = {
252267
{"http:///www.example.com/video.html", "LINK@/href", null, "canonical"},
253268
{"video.rss", "LINK@/href", null, "alternate"},
@@ -256,18 +271,24 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
256271
{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
257272
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
258273
};
259-
checkLinks(extractor.getNext(), html5links);
274+
resource = extractor.getNext();
275+
checkTitle(resource, "Test HTML5 Video Tag");
276+
checkLinks(resource, html5links);
260277
String[][] html5links2 = {
261278
{"http://www.example.com/", "A@/href"},
262279
};
263-
checkLinks(extractor.getNext(), html5links2);
280+
resource = extractor.getNext();
281+
checkTitle(resource, "Testing poor HTML5");
282+
checkLinks(resource, html5links2);
264283
String[][] fbVideoLinks = {
265284
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
266285
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
267286
{"https://www.facebook.com/facebook/", "A@/href"},
268287
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
269288
};
270-
checkLinks(extractor.getNext(), fbVideoLinks);
289+
resource = extractor.getNext();
290+
checkTitle(resource, "fb-video - Embedded Videos - Social Plugins");
291+
checkLinks(resource, fbVideoLinks);
271292
String[][] dataHrefLinks = {
272293
{"standard.css", "LINK@/href", null, "stylesheet"},
273294
{"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
@@ -293,7 +314,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
293314
{"#", "A@/href"},
294315
{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
295316
};
296-
checkLinks(extractor.getNext(), dataHrefLinks);
317+
resource = extractor.getNext();
318+
checkTitle(resource, null); // empty title!
319+
checkLinks(resource, dataHrefLinks);
297320
String[][] fbSocialLinks = {
298321
{"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
299322
{"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
@@ -305,7 +328,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
305328
{"https://www.facebook.com/facebook", "A@/href"},
306329
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
307330
};
308-
checkLinks(extractor.getNext(), fbSocialLinks);
331+
resource = extractor.getNext();
332+
// fragment without head and no title
333+
checkLinks(resource, fbSocialLinks);
309334
String[][] onClickLinks = {
310335
{"webpage.html", "DIV@/onclick"},
311336
{"index.html", "INPUT@/onclick"},
@@ -315,7 +340,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
315340
{"http://example.com/location/href/1.html", "INPUT@/onclick"},
316341
{"http://example.com/location/href/2.html", "INPUT@/onclick"}
317342
};
318-
checkLinks(extractor.getNext(), onClickLinks);
343+
resource = extractor.getNext();
344+
checkTitle(resource, "Test Extraction of URLs from INPUT onClick Attributes");
345+
checkLinks(resource, onClickLinks);
319346
String[][] escapedEntitiesLinks = {
320347
{"http://www.example.com/", "__base__"},
321348
{"http://www.example.com/redirected.html", "__meta_refresh__"},
@@ -325,12 +352,11 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
325352
{"https://img.example.org/view?id=867&res=10x16", "IMG@/src",
326353
"image URL containing escaped ampersand (\"&amp;\")" }
327354
};
328-
Resource resource = extractor.getNext();
355+
resource = extractor.getNext();
329356
assertNotNull(resource);
357+
checkTitle(resource, "Title – \"Title\" written using character entities");
330358
checkLinks(resource, escapedEntitiesLinks);
331359
MetaData md = resource.getMetaData();
332-
assertEquals("Wrong title", "Title – \"Title\" written using character entities",
333-
md.getJSONObject(ResourceConstants.HTML_HEAD).getString(ResourceConstants.HTML_TITLE));
334360
JSONArray metas = md.getJSONObject(ResourceConstants.HTML_HEAD).getJSONArray(ResourceConstants.HTML_META_TAGS);
335361
for (int i = 0; i < metas.length(); i++) {
336362
JSONObject o = metas.optJSONObject(i);
@@ -344,7 +370,7 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
344370
"Anchor text with white space character entities and HTML block elements" } };
345371
resource = extractor.getNext();
346372
assertNotNull(resource);
347-
System.out.println(resource);
373+
checkTitle(resource, "Test Anchor Text Extraction With Whitespace");
348374
checkLinks(resource, exampleLinks);
349375
}
350376

@@ -357,6 +383,7 @@ public void testTextExtraction() throws ResourceParseException, IOException {
357383
Resource resource = extractor.getNext();
358384
assertNotNull(resource);
359385
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
386+
checkTitle(resource, "White space and paragraph breaks when converting HTML to text");
360387
String text = resource.getMetaData().getString(ResourceConstants.HTML_TEXT);
361388
System.out.println(text);
362389
assertTrue(text.contains("text\nThere should be a paragraph break after <h1-h6>"));
@@ -377,6 +404,16 @@ public void testTextExtraction() throws ResourceParseException, IOException {
377404
// assertTrue(text.matches("CDATA in MathML:\\W*x<y"));
378405
}
379406

407+
public void testTitleExtraction() throws ResourceParseException, IOException {
408+
String testFileName = "title-extraction-embedded-SVG.warc";
409+
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
410+
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
411+
ExtractingResourceProducer extractor =
412+
new ExtractingResourceProducer(producer, mapper);
413+
Resource resource = extractor.getNext();
414+
checkTitle(resource, "Testing title extraction with embedded SVG");
415+
}
416+
380417
public void testHtmlParserEntityDecoding() {
381418
String[][] entities = { //
382419
/* ampersand */
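title-extraction-embedded-SVG.warc (new test resource loaded by testTitleExtraction; its directory path is missing from this capture)

Lines changed: 45 additions & 0 deletions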
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,45 @@
1+
WARC/1.0
2+
WARC-Type: response
3+
WARC-Record-ID: <urn:uuid:9043ba74-5d11-4dad-97c1-d7454f8b7358>
4+
WARC-Target-URI: https://www.example.org/testEmbeddedSVG.html
5+
WARC-Date: 2024-10-14T10:05:41Z
6+
WARC-IP-Address: 127.0.0.1
7+
WARC-Block-Digest: sha1:XNN4JA3QDUN4DDEGTIPH5ZRORHYL657F
8+
WARC-Payload-Digest: sha1:4FUACFTG3WCL26OITZNMEPRKFP6WAAHN
9+
Content-Type: application/http;msgtype=response
10+
Content-Length: 856
11+
12+
HTTP/1.1 200 OK
13+
Date: Mon, 14 Oct 2024 10:05:41 GMT
14+
Server: Apache/2.4.58 (Ubuntu)
15+
Upgrade: h2,h2c
16+
Connection: Upgrade, Keep-Alive
17+
Last-Modified: Mon, 14 Oct 2024 10:04:25 GMT
18+
ETag: "20a-6246cf6287f50"
19+
Accept-Ranges: bytes
20+
Content-Length: 522
21+
Vary: Accept-Encoding
22+
Keep-Alive: timeout=5, max=100
23+
Content-Type: text/html
24+
25+
<!DOCTYPE html>
26+
<html>
27+
<head>
28+
<title>Testing title extraction with embedded SVG</title>
29+
<meta charset="utf-8">
30+
</head>
31+
<body>
32+
<div>
33+
<header>Testing title extraction with embedded SVG</header>
34+
<p>This is body text...</p>
35+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 400 400" fill="currentColor" width="1em">
36+
<title>Embedded SVG</title>
37+
<rect x="0" y="0" width="100%" height="100%" fill="lightblue"/>
38+
<circle cx="100" cy="100" r="50" fill="red"/>
39+
</svg>
40+
</div>
41+
</body>
42+
</html>
43+
44+
45+

0 commit comments

Comments
 (0)