Skip to content

Commit b474f5d

Browse files
WAT extractor: do not add <meta itemprop="..." > from body as metadata
- rebase to recent head / master
- unit test: merge methods to verify any kind of metadata attributes
1 parent febb13f commit b474f5d

File tree

1 file changed

+15
-23
lines changed

1 file changed

+15
-23
lines changed

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 15 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -166,17 +166,21 @@ private void checkTitle(Resource resource, String title) {
166166
}
167167
}
168168

169-
private void checkExtractedAttributes(Resource resource, String... attributes) throws JSONException {
169+
private void checkExtractedAttributes(Resource resource, int metaElements, int metaElementIndex,
170+
String... attributes) throws JSONException {
170171
assertNotNull(resource);
171172
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
172173
JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
173174
assertNotNull(metas);
174-
JSONObject meta = metas.getJSONObject(0);
175+
if (metaElements > -1) {
176+
assertEquals(metaElements, metas.length());
177+
}
178+
JSONObject meta = metas.getJSONObject(metaElementIndex);
175179
assertEquals(attributes.length / 2, meta.length());
176180
for (int i = 0; i < attributes.length; i += 2) {
177181
String key = attributes[i];
178182
assertNotNull(meta.get(key));
179-
assertEquals(meta.get(key), attributes[i + 1]);
183+
assertEquals(attributes[i + 1], meta.get(key));
180184
}
181185
}
182186

@@ -255,20 +259,6 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
255259
}
256260
}
257261

258-
private void checkExtractHtmlLangAttribute(Resource resource, String... langAttributes)
259-
throws JSONException {
260-
assertNotNull(resource);
261-
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
262-
JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
263-
assertNotNull(metas);
264-
JSONObject meta = metas.getJSONObject(0);
265-
for (int i = 0; i < langAttributes.length; i += 2) {
266-
String key = langAttributes[i];
267-
assertNotNull(meta.get(key));
268-
assertEquals(meta.get(key), langAttributes[i+1]);
269-
}
270-
}
271-
272262
public void testLinkExtraction() throws ResourceParseException, IOException {
273263
String testFileName = "link-extraction-test.warc";
274264
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
@@ -448,19 +438,21 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException,
448438
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
449439
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
450440
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
451-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
452-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "zh-CN");
453-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "cs-cz");
454-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
455-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX");
441+
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "en");
442+
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "zh-CN");
443+
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/lang", "content", "cs-cz");
444+
checkExtractedAttributes(extractor.getNext(), 2, 0, "name", "HTML@/lang", "content", "en");
445+
checkExtractedAttributes(extractor.getNext(), 1, 0, "name", "HTML@/xml:lang", "content", "es-MX");
456446
}
457447

458448
public void testBodyMetaElements() throws ResourceParseException, IOException {
459449
String testFileName = "meta-itemprop.warc";
460450
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
461451
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
462452
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
463-
checkExtractedAttributes(extractor.getNext(), "name", "robots", "content", "index,follow");
453+
Resource resource = extractor.getNext();
454+
checkExtractedAttributes(resource, 2, 0, "name", "HTML@/lang", "content", "en");
455+
checkExtractedAttributes(resource, 2, 1, "name", "robots", "content", "index,follow");
464456
}
465457

466458
public void testHtmlParserEntityDecoding() {

0 commit comments

Comments
 (0)