Skip to content

Commit fc11441

Browse files
WAT extractor: do not extract page title from embedded SVG images
- add a unit test verifying that the correct title is extracted from a document which includes an embedded SVG image containing a title element - extend existing unit tests to also check for proper title extraction
1 parent ea6cafd commit fc11441

File tree

2 files changed

+93
-11
lines changed

2 files changed

+93
-11
lines changed

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,19 @@ private void checkAnchor(Multimap<String,String> anchors, String url, String anc
152152
assertTrue("Wrong anchor text " + anchor + " for " + url, anchors.get(url).contains(anchor));
153153
}
154154

155+
private void checkTitle(Resource resource, String title) {
156+
assertNotNull(resource);
157+
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
158+
JSONObject head = resource.getMetaData().optJSONObject("Head");
159+
if (title != null) {
160+
assertNotNull(head);
161+
assertTrue("No title found", head.has(ResourceConstants.HTML_TITLE));
162+
assertEquals(title, head.get(ResourceConstants.HTML_TITLE));
163+
} else {
164+
assertFalse(head.has(ResourceConstants.HTML_TITLE));
165+
}
166+
}
167+
155168
private void checkLinks(Resource resource, String[][] expectedLinks) {
156169
assertNotNull(resource);
157170
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
@@ -247,7 +260,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
247260
{"http://www.example.com/shakespeare.html", "Q@/cite"},
248261
{"http://www.example.com/shakespeare-long.html", "BLOCKQUOTE@/cite"}
249262
};
250-
checkLinks(extractor.getNext(), html4links);
263+
Resource resource = extractor.getNext();
264+
checkTitle(resource, "Test XHTML Link Extraction");
265+
checkLinks(resource, html4links);
251266
String[][] html5links = {
252267
{"http:///www.example.com/video.html", "LINK@/href", null, "canonical"},
253268
{"video.rss", "LINK@/href", null, "alternate"},
@@ -256,18 +271,24 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
256271
{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
257272
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv", "SOURCE@/src"}
258273
};
259-
checkLinks(extractor.getNext(), html5links);
274+
resource = extractor.getNext();
275+
checkTitle(resource, "Test HTML5 Video Tag");
276+
checkLinks(resource, html5links);
260277
String[][] html5links2 = {
261278
{"http://www.example.com/", "A@/href"},
262279
};
263-
checkLinks(extractor.getNext(), html5links2);
280+
resource = extractor.getNext();
281+
checkTitle(resource, "Testing poor HTML5");
282+
checkLinks(resource, html5links2);
264283
String[][] fbVideoLinks = {
265284
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
266285
{"https://www.facebook.com/facebook/videos/10153231379946729/", "A@/href"},
267286
{"https://www.facebook.com/facebook/", "A@/href"},
268287
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"}
269288
};
270-
checkLinks(extractor.getNext(), fbVideoLinks);
289+
resource = extractor.getNext();
290+
checkTitle(resource, "fb-video - Embedded Videos - Social Plugins");
291+
checkLinks(resource, fbVideoLinks);
271292
String[][] dataHrefLinks = {
272293
{"standard.css", "LINK@/href", null, "stylesheet"},
273294
{"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
@@ -293,7 +314,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
293314
{"#", "A@/href"},
294315
{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
295316
};
296-
checkLinks(extractor.getNext(), dataHrefLinks);
317+
resource = extractor.getNext();
318+
checkTitle(resource, null); // empty title!
319+
checkLinks(resource, dataHrefLinks);
297320
String[][] fbSocialLinks = {
298321
{"http://www.your-domain.com/your-page.html", "DIV@/data-uri"},
299322
{"https://developers.facebook.com/docs/plugins/comments#configurator", "DIV@/data-href"},
@@ -305,7 +328,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
305328
{"https://www.facebook.com/facebook", "A@/href"},
306329
{"http://www.your-domain.com/your-page.html", "DIV@/data-href"}
307330
};
308-
checkLinks(extractor.getNext(), fbSocialLinks);
331+
resource = extractor.getNext();
332+
// fragment without head and no title
333+
checkLinks(resource, fbSocialLinks);
309334
String[][] onClickLinks = {
310335
{"webpage.html", "DIV@/onclick"},
311336
{"index.html", "INPUT@/onclick"},
@@ -315,7 +340,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
315340
{"http://example.com/location/href/1.html", "INPUT@/onclick"},
316341
{"http://example.com/location/href/2.html", "INPUT@/onclick"}
317342
};
318-
checkLinks(extractor.getNext(), onClickLinks);
343+
resource = extractor.getNext();
344+
checkTitle(resource, "Test Extraction of URLs from INPUT onClick Attributes");
345+
checkLinks(resource, onClickLinks);
319346
String[][] escapedEntitiesLinks = {
320347
{"http://www.example.com/", "__base__"},
321348
{"http://www.example.com/redirected.html", "__meta_refresh__"},
@@ -325,12 +352,11 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
325352
{"https://img.example.org/view?id=867&res=10x16", "IMG@/src",
326353
"image URL containing escaped ampersand (\"&amp;\")" }
327354
};
328-
Resource resource = extractor.getNext();
355+
resource = extractor.getNext();
329356
assertNotNull(resource);
357+
checkTitle(resource, "Title – \"Title\" written using character entities");
330358
checkLinks(resource, escapedEntitiesLinks);
331359
MetaData md = resource.getMetaData();
332-
assertEquals("Wrong title", "Title – \"Title\" written using character entities",
333-
md.getJSONObject(ResourceConstants.HTML_HEAD).getString(ResourceConstants.HTML_TITLE));
334360
JSONArray metas = md.getJSONObject(ResourceConstants.HTML_HEAD).getJSONArray(ResourceConstants.HTML_META_TAGS);
335361
for (int i = 0; i < metas.length(); i++) {
336362
JSONObject o = metas.optJSONObject(i);
@@ -344,7 +370,7 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
344370
"Anchor text with white space character entities and HTML block elements" } };
345371
resource = extractor.getNext();
346372
assertNotNull(resource);
347-
System.out.println(resource);
373+
checkTitle(resource, "Test Anchor Text Extraction With Whitespace");
348374
checkLinks(resource, exampleLinks);
349375
}
350376

@@ -357,6 +383,7 @@ public void testTextExtraction() throws ResourceParseException, IOException {
357383
Resource resource = extractor.getNext();
358384
assertNotNull(resource);
359385
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
386+
checkTitle(resource, "White space and paragraph breaks when converting HTML to text");
360387
String text = resource.getMetaData().getString(ResourceConstants.HTML_TEXT);
361388
System.out.println(text);
362389
assertTrue(text.contains("text\nThere should be a paragraph break after <h1-h6>"));
@@ -377,6 +404,16 @@ public void testTextExtraction() throws ResourceParseException, IOException {
377404
// assertTrue(text.matches("CDATA in MathML:\\W*x<y"));
378405
}
379406

407+
public void testTitleExtraction() throws ResourceParseException, IOException {
408+
String testFileName = "title-extraction-embedded-SVG.warc";
409+
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
410+
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
411+
ExtractingResourceProducer extractor =
412+
new ExtractingResourceProducer(producer, mapper);
413+
Resource resource = extractor.getNext();
414+
checkTitle(resource, "Testing title extraction with embedded SVG");
415+
}
416+
380417
public void testHtmlParserEntityDecoding() {
381418
String[][] entities = { //
382419
/* ampersand */
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
WARC/1.0
2+
WARC-Type: response
3+
WARC-Record-ID: <urn:uuid:9043ba74-5d11-4dad-97c1-d7454f8b7358>
4+
WARC-Target-URI: https://www.example.org/testEmbeddedSVG.html
5+
WARC-Date: 2024-10-14T10:05:41Z
6+
WARC-IP-Address: 127.0.0.1
7+
WARC-Block-Digest: sha1:XNN4JA3QDUN4DDEGTIPH5ZRORHYL657F
8+
WARC-Payload-Digest: sha1:4FUACFTG3WCL26OITZNMEPRKFP6WAAHN
9+
Content-Type: application/http;msgtype=response
10+
Content-Length: 856
11+
12+
HTTP/1.1 200 OK
13+
Date: Mon, 14 Oct 2024 10:05:41 GMT
14+
Server: Apache/2.4.58 (Ubuntu)
15+
Upgrade: h2,h2c
16+
Connection: Upgrade, Keep-Alive
17+
Last-Modified: Mon, 14 Oct 2024 10:04:25 GMT
18+
ETag: "20a-6246cf6287f50"
19+
Accept-Ranges: bytes
20+
Content-Length: 522
21+
Vary: Accept-Encoding
22+
Keep-Alive: timeout=5, max=100
23+
Content-Type: text/html
24+
25+
<!DOCTYPE html>
26+
<html>
27+
<head>
28+
<title>Testing title extraction with embedded SVG</title>
29+
<meta charset="utf-8">
30+
</head>
31+
<body>
32+
<div>
33+
<header>Testing title extraction with embedded SVG</header>
34+
<p>This is body text...</p>
35+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 400 400" fill="currentColor" width="1em">
36+
<title>Embedded SVG</title>
37+
<rect x="0" y="0" width="100%" height="100%" fill="lightblue"/>
38+
<circle cx="100" cy="100" r="50" fill="red"/>
39+
</svg>
40+
</div>
41+
</body>
42+
</html>
43+
44+
45+

0 commit comments

Comments
 (0)