@@ -166,6 +166,24 @@ private void checkTitle(Resource resource, String title) {
166166 }
167167 }
168168
169+ private void checkExtractedAttributes (Resource resource , int metaElements , int metaElementIndex ,
170+ String ... attributes ) throws JSONException {
171+ assertNotNull (resource );
172+ assertTrue ("Wrong instance type of Resource: " + resource .getClass (), resource instanceof HTMLResource );
173+ JSONArray metas = resource .getMetaData ().getJSONObject ("Head" ).getJSONArray ("Metas" );
174+ assertNotNull (metas );
175+ if (metaElements > -1 ) {
176+ assertEquals (metaElements , metas .length ());
177+ }
178+ JSONObject meta = metas .getJSONObject (metaElementIndex );
179+ assertEquals (attributes .length / 2 , meta .length ());
180+ for (int i = 0 ; i < attributes .length ; i += 2 ) {
181+ String key = attributes [i ];
182+ assertNotNull (meta .get (key ));
183+ assertEquals (attributes [i + 1 ], meta .get (key ));
184+ }
185+ }
186+
169187 private void checkLinks (Resource resource , String [][] expectedLinks ) {
170188 assertNotNull (resource );
171189 assertTrue ("Wrong instance type of Resource: " + resource .getClass (), resource instanceof HTMLResource );
@@ -241,20 +259,6 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
241259 }
242260 }
243261
244- private void checkExtractHtmlLangAttribute (Resource resource , String ... langAttributes )
245- throws JSONException {
246- assertNotNull (resource );
247- assertTrue ("Wrong instance type of Resource: " + resource .getClass (), resource instanceof HTMLResource );
248- JSONArray metas = resource .getMetaData ().getJSONObject ("Head" ).getJSONArray ("Metas" );
249- assertNotNull (metas );
250- JSONObject meta = metas .getJSONObject (0 );
251- for (int i = 0 ; i < langAttributes .length ; i += 2 ) {
252- String key = langAttributes [i ];
253- assertNotNull (meta .get (key ));
254- assertEquals (meta .get (key ), langAttributes [i +1 ]);
255- }
256- }
257-
258262 public void testLinkExtraction () throws ResourceParseException , IOException {
259263 String testFileName = "link-extraction-test.warc" ;
260264 ResourceProducer producer = ProducerUtils .getProducer (getClass ().getResource (testFileName ).getPath ());
@@ -434,11 +438,21 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException,
434438 ResourceProducer producer = ProducerUtils .getProducer (getClass ().getResource (testFileName ).getPath ());
435439 ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper ();
436440 ExtractingResourceProducer extractor = new ExtractingResourceProducer (producer , mapper );
437- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/lang" , "content" , "en" );
438- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/lang" , "content" , "zh-CN" );
439- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/lang" , "content" , "cs-cz" );
440- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/lang" , "content" , "en" );
441- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/xml:lang" , "content" , "es-MX" );
441+ checkExtractedAttributes (extractor .getNext (), 1 , 0 , "name" , "HTML@/lang" , "content" , "en" );
442+ checkExtractedAttributes (extractor .getNext (), 1 , 0 , "name" , "HTML@/lang" , "content" , "zh-CN" );
443+ checkExtractedAttributes (extractor .getNext (), 1 , 0 , "name" , "HTML@/lang" , "content" , "cs-cz" );
444+ checkExtractedAttributes (extractor .getNext (), 2 , 0 , "name" , "HTML@/lang" , "content" , "en" );
445+ checkExtractedAttributes (extractor .getNext (), 1 , 0 , "name" , "HTML@/xml:lang" , "content" , "es-MX" );
446+ }
447+
448+ public void testBodyMetaElements () throws ResourceParseException , IOException {
449+ String testFileName = "meta-itemprop.warc" ;
450+ ResourceProducer producer = ProducerUtils .getProducer (getClass ().getResource (testFileName ).getPath ());
451+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper ();
452+ ExtractingResourceProducer extractor = new ExtractingResourceProducer (producer , mapper );
453+ Resource resource = extractor .getNext ();
454+ checkExtractedAttributes (resource , 2 , 0 , "name" , "HTML@/lang" , "content" , "en" );
455+ checkExtractedAttributes (resource , 2 , 1 , "name" , "robots" , "content" , "index,follow" );
442456 }
443457
444458 public void testHtmlParserEntityDecoding () {
0 commit comments