@@ -152,6 +152,19 @@ private void checkAnchor(Multimap<String,String> anchors, String url, String anc
152
152
assertTrue ("Wrong anchor text " + anchor + " for " + url , anchors .get (url ).contains (anchor ));
153
153
}
154
154
155
+ private void checkTitle (Resource resource , String title ) {
156
+ assertNotNull (resource );
157
+ assertTrue ("Wrong instance type of Resource: " + resource .getClass (), resource instanceof HTMLResource );
158
+ JSONObject head = resource .getMetaData ().optJSONObject ("Head" );
159
+ if (title != null ) {
160
+ assertNotNull (head );
161
+ assertTrue ("No title found" , head .has (ResourceConstants .HTML_TITLE ));
162
+ assertEquals (title , head .get (ResourceConstants .HTML_TITLE ));
163
+ } else {
164
+ assertFalse (head .has (ResourceConstants .HTML_TITLE ));
165
+ }
166
+ }
167
+
155
168
private void checkLinks (Resource resource , String [][] expectedLinks ) {
156
169
assertNotNull (resource );
157
170
assertTrue ("Wrong instance type of Resource: " + resource .getClass (), resource instanceof HTMLResource );
@@ -247,7 +260,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
247
260
{"http://www.example.com/shakespeare.html" , "Q@/cite" },
248
261
{"http://www.example.com/shakespeare-long.html" , "BLOCKQUOTE@/cite" }
249
262
};
250
- checkLinks (extractor .getNext (), html4links );
263
+ Resource resource = extractor .getNext ();
264
+ checkTitle (resource , "Test XHTML Link Extraction" );
265
+ checkLinks (resource , html4links );
251
266
String [][] html5links = {
252
267
{"http:///www.example.com/video.html" , "LINK@/href" , null , "canonical" },
253
268
{"video.rss" , "LINK@/href" , null , "alternate" },
@@ -256,18 +271,24 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
256
271
{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4" , "SOURCE@/src" },
257
272
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.ogv" , "SOURCE@/src" }
258
273
};
259
- checkLinks (extractor .getNext (), html5links );
274
+ resource = extractor .getNext ();
275
+ checkTitle (resource , "Test HTML5 Video Tag" );
276
+ checkLinks (resource , html5links );
260
277
String [][] html5links2 = {
261
278
{"http://www.example.com/" , "A@/href" },
262
279
};
263
- checkLinks (extractor .getNext (), html5links2 );
280
+ resource = extractor .getNext ();
281
+ checkTitle (resource , "Testing poor HTML5" );
282
+ checkLinks (resource , html5links2 );
264
283
String [][] fbVideoLinks = {
265
284
{"https://www.facebook.com/facebook/videos/10153231379946729/" , "BLOCKQUOTE@/cite" },
266
285
{"https://www.facebook.com/facebook/videos/10153231379946729/" , "A@/href" },
267
286
{"https://www.facebook.com/facebook/" , "A@/href" },
268
287
{"https://www.facebook.com/facebook/videos/10153231379946729/" , "DIV@/data-href" }
269
288
};
270
- checkLinks (extractor .getNext (), fbVideoLinks );
289
+ resource = extractor .getNext ();
290
+ checkTitle (resource , "fb-video - Embedded Videos - Social Plugins" );
291
+ checkLinks (resource , fbVideoLinks );
271
292
String [][] dataHrefLinks = {
272
293
{"standard.css" , "LINK@/href" , null , "stylesheet" },
273
294
{"https://www.facebook.com/elegantthemes/videos/10153760379211923/" , "DIV@/data-href" },
@@ -293,7 +314,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
293
314
{"#" , "A@/href" },
294
315
{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0" , "IFRAME@/src" }
295
316
};
296
- checkLinks (extractor .getNext (), dataHrefLinks );
317
+ resource = extractor .getNext ();
318
+ checkTitle (resource , null ); // empty title!
319
+ checkLinks (resource , dataHrefLinks );
297
320
String [][] fbSocialLinks = {
298
321
{"http://www.your-domain.com/your-page.html" , "DIV@/data-uri" },
299
322
{"https://developers.facebook.com/docs/plugins/comments#configurator" , "DIV@/data-href" },
@@ -305,7 +328,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
305
328
{"https://www.facebook.com/facebook" , "A@/href" },
306
329
{"http://www.your-domain.com/your-page.html" , "DIV@/data-href" }
307
330
};
308
- checkLinks (extractor .getNext (), fbSocialLinks );
331
+ resource = extractor .getNext ();
332
+ // fragment without head and no title
333
+ checkLinks (resource , fbSocialLinks );
309
334
String [][] onClickLinks = {
310
335
{"webpage.html" , "DIV@/onclick" },
311
336
{"index.html" , "INPUT@/onclick" },
@@ -315,7 +340,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
315
340
{"http://example.com/location/href/1.html" , "INPUT@/onclick" },
316
341
{"http://example.com/location/href/2.html" , "INPUT@/onclick" }
317
342
};
318
- checkLinks (extractor .getNext (), onClickLinks );
343
+ resource = extractor .getNext ();
344
+ checkTitle (resource , "Test Extraction of URLs from INPUT onClick Attributes" );
345
+ checkLinks (resource , onClickLinks );
319
346
String [][] escapedEntitiesLinks = {
320
347
{"http://www.example.com/" , "__base__" },
321
348
{"http://www.example.com/redirected.html" , "__meta_refresh__" },
@@ -325,12 +352,11 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
325
352
{"https://img.example.org/view?id=867&res=10x16" , "IMG@/src" ,
326
353
"image URL containing escaped ampersand (\" &\" )" }
327
354
};
328
- Resource resource = extractor .getNext ();
355
+ resource = extractor .getNext ();
329
356
assertNotNull (resource );
357
+ checkTitle (resource , "Title – \" Title\" written using character entities" );
330
358
checkLinks (resource , escapedEntitiesLinks );
331
359
MetaData md = resource .getMetaData ();
332
- assertEquals ("Wrong title" , "Title – \" Title\" written using character entities" ,
333
- md .getJSONObject (ResourceConstants .HTML_HEAD ).getString (ResourceConstants .HTML_TITLE ));
334
360
JSONArray metas = md .getJSONObject (ResourceConstants .HTML_HEAD ).getJSONArray (ResourceConstants .HTML_META_TAGS );
335
361
for (int i = 0 ; i < metas .length (); i ++) {
336
362
JSONObject o = metas .optJSONObject (i );
@@ -344,7 +370,7 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
344
370
"Anchor text with white space character entities and HTML block elements" } };
345
371
resource = extractor .getNext ();
346
372
assertNotNull (resource );
347
- System . out . println (resource );
373
+ checkTitle (resource , "Test Anchor Text Extraction With Whitespace" );
348
374
checkLinks (resource , exampleLinks );
349
375
}
350
376
@@ -357,6 +383,7 @@ public void testTextExtraction() throws ResourceParseException, IOException {
357
383
Resource resource = extractor .getNext ();
358
384
assertNotNull (resource );
359
385
assertTrue ("Wrong instance type of Resource: " + resource .getClass (), resource instanceof HTMLResource );
386
+ checkTitle (resource , "White space and paragraph breaks when converting HTML to text" );
360
387
String text = resource .getMetaData ().getString (ResourceConstants .HTML_TEXT );
361
388
System .out .println (text );
362
389
assertTrue (text .contains ("text\n There should be a paragraph break after <h1-h6>" ));
@@ -377,6 +404,16 @@ public void testTextExtraction() throws ResourceParseException, IOException {
377
404
// assertTrue(text.matches("CDATA in MathML:\\W*x<y"));
378
405
}
379
406
407
+ public void testTitleExtraction () throws ResourceParseException , IOException {
408
+ String testFileName = "title-extraction-embedded-SVG.warc" ;
409
+ ResourceProducer producer = ProducerUtils .getProducer (getClass ().getResource (testFileName ).getPath ());
410
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper ();
411
+ ExtractingResourceProducer extractor =
412
+ new ExtractingResourceProducer (producer , mapper );
413
+ Resource resource = extractor .getNext ();
414
+ checkTitle (resource , "Testing title extraction with embedded SVG" );
415
+ }
416
+
380
417
public void testHtmlParserEntityDecoding () {
381
418
String [][] entities = { //
382
419
/* ampersand */
0 commit comments