@@ -166,17 +166,21 @@ private void checkTitle(Resource resource, String title) {
166166 }
167167 }
168168
169- private void checkExtractedAttributes (Resource resource , String ... attributes ) throws JSONException {
169+ private void checkExtractedAttributes (Resource resource , int metaElements , int metaElementIndex ,
170+ String ... attributes ) throws JSONException {
170171 assertNotNull (resource );
171172 assertTrue ("Wrong instance type of Resource: " + resource .getClass (), resource instanceof HTMLResource );
172173 JSONArray metas = resource .getMetaData ().getJSONObject ("Head" ).getJSONArray ("Metas" );
173174 assertNotNull (metas );
174- JSONObject meta = metas .getJSONObject (0 );
175+ if (metaElements > -1 ) {
176+ assertEquals (metaElements , metas .length ());
177+ }
178+ JSONObject meta = metas .getJSONObject (metaElementIndex );
175179 assertEquals (attributes .length / 2 , meta .length ());
176180 for (int i = 0 ; i < attributes .length ; i += 2 ) {
177181 String key = attributes [i ];
178182 assertNotNull (meta .get (key ));
179- assertEquals (meta . get ( key ), attributes [i + 1 ]);
183+ assertEquals (attributes [i + 1 ], meta . get ( key ) );
180184 }
181185 }
182186
@@ -255,20 +259,6 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
255259 }
256260 }
257261
258- private void checkExtractHtmlLangAttribute (Resource resource , String ... langAttributes )
259- throws JSONException {
260- assertNotNull (resource );
261- assertTrue ("Wrong instance type of Resource: " + resource .getClass (), resource instanceof HTMLResource );
262- JSONArray metas = resource .getMetaData ().getJSONObject ("Head" ).getJSONArray ("Metas" );
263- assertNotNull (metas );
264- JSONObject meta = metas .getJSONObject (0 );
265- for (int i = 0 ; i < langAttributes .length ; i += 2 ) {
266- String key = langAttributes [i ];
267- assertNotNull (meta .get (key ));
268- assertEquals (meta .get (key ), langAttributes [i +1 ]);
269- }
270- }
271-
272262 public void testLinkExtraction () throws ResourceParseException , IOException {
273263 String testFileName = "link-extraction-test.warc" ;
274264 ResourceProducer producer = ProducerUtils .getProducer (getClass ().getResource (testFileName ).getPath ());
@@ -448,19 +438,21 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException,
448438 ResourceProducer producer = ProducerUtils .getProducer (getClass ().getResource (testFileName ).getPath ());
449439 ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper ();
450440 ExtractingResourceProducer extractor = new ExtractingResourceProducer (producer , mapper );
451- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/lang" , "content" , "en" );
452- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/lang" , "content" , "zh-CN" );
453- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/lang" , "content" , "cs-cz" );
454- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/lang" , "content" , "en" );
455- checkExtractHtmlLangAttribute (extractor .getNext (), "name" , "HTML@/xml:lang" , "content" , "es-MX" );
441+ checkExtractedAttributes (extractor .getNext (), 1 , 0 , "name" , "HTML@/lang" , "content" , "en" );
442+ checkExtractedAttributes (extractor .getNext (), 1 , 0 , "name" , "HTML@/lang" , "content" , "zh-CN" );
443+ checkExtractedAttributes (extractor .getNext (), 1 , 0 , "name" , "HTML@/lang" , "content" , "cs-cz" );
444+ checkExtractedAttributes (extractor .getNext (), 2 , 0 , "name" , "HTML@/lang" , "content" , "en" );
445+ checkExtractedAttributes (extractor .getNext (), 1 , 0 , "name" , "HTML@/xml:lang" , "content" , "es-MX" );
456446 }
457447
458448 public void testBodyMetaElements () throws ResourceParseException , IOException {
459449 String testFileName = "meta-itemprop.warc" ;
460450 ResourceProducer producer = ProducerUtils .getProducer (getClass ().getResource (testFileName ).getPath ());
461451 ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper ();
462452 ExtractingResourceProducer extractor = new ExtractingResourceProducer (producer , mapper );
463- checkExtractedAttributes (extractor .getNext (), "name" , "robots" , "content" , "index,follow" );
453+ Resource resource = extractor .getNext ();
454+ checkExtractedAttributes (resource , 2 , 0 , "name" , "HTML@/lang" , "content" , "en" );
455+ checkExtractedAttributes (resource , 2 , 1 , "name" , "robots" , "content" , "index,follow" );
464456 }
465457
466458 public void testHtmlParserEntityDecoding () {
0 commit comments